DeepMoA: method to predict the mechanism of action of cancer drugs

Select data and import libraries

import sys # we require code from other folders
import pandas as pd
import numpy as np
import itertools
import pickle
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import seaborn as sns
import matplotlib.pyplot as plt
CB_color_cycle = ['#EECC16', '#62BB35', '#FDAE33','#208EA3', '#EA4E9D', '#984ea3','#999999', '#e41a1c', '#dede00']
#sns.set_style("darkgrid")
import matplotlib.font_manager as fm
font_files = fm.findSystemFonts()

plt.rcdefaults()
# Go through and add each to Matplotlib's font cache.
for font_file in font_files:
    fm.fontManager.addfont(font_file)
plt.rc('font', family='Roboto')
plt.rc('font', family='Roboto')

plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Roboto'
#%config InlineBackend.figure_format='retina'
# pytorch relates imports
import torch
import torch.nn as nn
import torch.optim as optim

# imports from captum library
from captum.attr import LayerDeepLift
# for combobox
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pd.options.display.min_rows = 20000
pd.set_option('max_colwidth', 200)
pd.options.display.max_rows = 20000
pd.set_option('min_rows', 20000)
mac = "/Users/katyna/Library/CloudStorage/OneDrive-Tecnun/"
windows = "C:/Users/ksada/OneDrive - Tecnun/"
computer = windows # CHANGE
sys.path.append(computer + "SparseGO_code/code")
import util
from util import *
%matplotlib inline
#%matplotlib inline

# To make histograms
def histogram(dataframe, color, title, ylabel, n_bins):
    """Plot a p-value histogram on the current matplotlib axes.

    Bars whose left bin edge is below 0.05 are recoloured with
    ``CB_color_cycle[2]``; the remaining bars keep ``color``.

    Parameters
    ----------
    dataframe : values passed straight to ``plt.hist``.
    color : base face colour of the bars.
    title : text used as the plot title.
    ylabel : text used as the y-axis label (the x-axis label is fixed to "P-value").
    n_bins : forwarded to ``plt.hist`` as ``bins``.
    """
    _, bins, patches = plt.hist(dataframe, color=color, bins=n_bins, linewidth=0.1)

    # `bins` holds one more edge than there are bars, so zip() pairs every bar
    # with its left edge and stops after the last bar.
    for left_edge, patch in zip(bins, patches):
        if left_edge < 0.05:
            patch.set_facecolor(CB_color_cycle[2])

    plt.xlabel("P-value", fontsize=16)
    plt.ylabel(ylabel, fontsize=16)
    plt.title(title, fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    # Style the axes that were actually drawn on. The previous
    # `plt.subplot(111)` call could return/create a different axes when the
    # current figure is not a single 1x1 grid.
    ax = plt.gca()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
# --- Input / output locations (all relative to the machine-specific `computer` root) ---
inputdir = computer+"SparseGO_code/data/cross_validation_expression/allsamples/" # CHANGE: folder holding the model input files
dir1=computer+"Tesis/Codigo/VariableImportance/"
dir2=computer+"SparseGO_code/results/weights&biases/Expression_MSE_all/" # CHANGE: folder holding the trained model
resultsdir=dir2
# Files read below with load_mapping / np.genfromtxt
gene2id = inputdir+"gene2ind.txt"
cell2id=inputdir+"cell2ind.txt"
drug2id=inputdir+"drug2ind.txt"
drug2fingerprint=inputdir+"drug2fingerprint.txt"
# Serialized model loaded later with torch.load
load=resultsdir+"last_model.pt"

onto = inputdir+"ontology.txt"  # CHANGE: ontology file passed to load_ontology
genotype=inputdir+"cell2expression.txt"  # CHANGE: cell feature matrix (comma-separated, read with np.genfromtxt)

# Number of neurons that represent each GO term; used to build the
# "<term>_<k>" neuron labels further down.
num_neurons_per_GO = 6 # CHANGE

DeepLIFT

# Helpers below (load_mapping, load_ontology, sort_pairs, pairs_in_layers) come
# from `util` via `from util import *`; their exact semantics are not visible here.
gene2id_mapping = load_mapping(gene2id)
dG, terms_pairs, genes_terms_pairs = load_ontology(onto, gene2id_mapping)
sorted_pairs, level_list, level_number = sort_pairs(genes_terms_pairs, terms_pairs, dG, gene2id_mapping)
# layer_connections is indexed per layer further down (layer_connections[layer_number])
layer_connections = pairs_in_layers(sorted_pairs, level_list, level_number) 

# Feature matrices, read as comma-separated numeric text
cell_features = np.genfromtxt(genotype, delimiter=',')
drug_features = np.genfromtxt(drug2fingerprint, delimiter=',')

# Later code uses drug2id_mapping[<drug>] as a row index into drug_features
drug2id_mapping = load_mapping(drug2id)
cell2id_mapping = load_mapping(cell2id)

num_genes = len(gene2id_mapping)
drug_dim = len(drug_features[0,:]) # number of columns of the drug feature matrix
There are 15015 genes
There are 1 roots: GO:0008150
There are 4184 terms
There are 1 connected components
# Load the trained network onto GPU 0 ('cuda:%d' % 0 == 'cuda:0'); requires a CUDA device.
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints.
# NOTE(review): the checkpoint appears to hold a whole pickled module rather than a
# state_dict; recent PyTorch releases may need weights_only=False here — confirm.
model = torch.load(load, map_location='cuda:%d' % 0)
model
sparseGO_nn(
  (genes_terms_sparse_linear_1): SparseLinearNew(
    in_features=15015, out_features=25104, bias=True, sparsity=0.0030196221878822263, connectivity=tensor([[    0,     1,     2,  ..., 23721, 23722, 23723],
            [    0,     0,     0,  ..., 15014, 15014, 15014]], device='cuda:0'), small_world=False
  )
  (genes_terms_tanh): Tanh()
  (genes_terms_batchnorm): BatchNorm1d(25104, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_1): SparseLinearNew(
    in_features=25104, out_features=8304, bias=True, sparsity=0.002372788160788691, connectivity=tensor([[  966,   967,   968,  ...,  7047,  7048,  7049],
            [    0,     0,     0,  ..., 25103, 25103, 25103]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_1): Tanh()
  (GO_terms_batchnorm_1): BatchNorm1d(8304, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_2): SparseLinearNew(
    in_features=8304, out_features=3684, bias=True, sparsity=0.003911619061964564, connectivity=tensor([[   0,    1,    2,  ..., 3681, 3682, 3683],
            [   0,    0,    0,  ..., 8303, 8303, 8303]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_2): Tanh()
  (GO_terms_batchnorm_2): BatchNorm1d(3684, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_3): SparseLinearNew(
    in_features=3684, out_features=1650, bias=True, sparsity=0.007924193070772875, connectivity=tensor([[ 150,  151,  152,  ..., 1641, 1642, 1643],
            [   0,    0,    0,  ..., 3683, 3683, 3683]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_3): Tanh()
  (GO_terms_batchnorm_3): BatchNorm1d(1650, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_4): SparseLinearNew(
    in_features=1650, out_features=726, bias=True, sparsity=0.015807663410969196, connectivity=tensor([[ 474,  475,  476,  ...,  711,  712,  713],
            [   0,    0,    0,  ..., 1649, 1649, 1649]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_4): Tanh()
  (GO_terms_batchnorm_4): BatchNorm1d(726, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_5): SparseLinearNew(
    in_features=726, out_features=318, bias=True, sparsity=0.03305785123966942, connectivity=tensor([[ 60,  61,  62,  ..., 105, 106, 107],
            [  0,   0,   0,  ..., 725, 725, 725]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_5): Tanh()
  (GO_terms_batchnorm_5): BatchNorm1d(318, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_6): SparseLinearNew(
    in_features=318, out_features=120, bias=True, sparsity=0.06981132075471698, connectivity=tensor([[  0,   1,   2,  ...,  93,  94,  95],
            [  0,   0,   0,  ..., 317, 317, 317]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_6): Tanh()
  (GO_terms_batchnorm_6): BatchNorm1d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_7): SparseLinearNew(
    in_features=120, out_features=42, bias=True, sparsity=0.2, connectivity=tensor([[ 18,  19,  20,  ...,  21,  22,  23],
            [  0,   0,   0,  ..., 119, 119, 119]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_7): Tanh()
  (GO_terms_batchnorm_7): BatchNorm1d(42, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (GO_terms_sparse_linear_8): SparseLinearNew(
    in_features=42, out_features=30, bias=True, sparsity=1.0, connectivity=tensor([[ 0,  1,  2,  ..., 27, 28, 29],
            [ 0,  0,  0,  ..., 41, 41, 41]], device='cuda:0'), small_world=False
  )
  (GO_terms_tanh_8): Tanh()
  (GO_terms_batchnorm_8): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drug_linear_layer_1): Linear(in_features=2048, out_features=200, bias=True)
  (drug_tanh_1): Tanh()
  (drug_batchnorm_layer_1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drug_linear_layer_2): Linear(in_features=200, out_features=100, bias=True)
  (drug_tanh_2): Tanh()
  (drug_batchnorm_layer_2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drug_linear_layer_3): Linear(in_features=100, out_features=50, bias=True)
  (drug_tanh_3): Tanh()
  (drug_batchnorm_layer_3): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (final_linear_layer): Linear(in_features=80, out_features=40, bias=True)
  (final_tanh): Tanh()
  (final_batchnorm_layer): BatchNorm1d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (final_aux_linear_layer): Linear(in_features=40, out_features=1, bias=True)
  (final_aux_tanh): Tanh()
  (final_linear_layer_output): Linear(in_features=1, out_features=1, bias=True)
)
# Layers whose attributions are analysed: the gene->term layer followed by
# GO_terms_sparse_linear_1 ... GO_terms_sparse_linear_7, in that order.
model_layers = [model.genes_terms_sparse_linear_1]
model_layers.extend(
    getattr(model, "GO_terms_sparse_linear_%d" % depth) for depth in range(1, 8)
)

GO terms info

# GO term names. The spreadsheet is read from the current working directory.
# Later code looks terms up through the 'id' column and takes the value in
# column position 1 as the term name.
gene_ontology = pd.read_excel('all_go_terms_info.xlsx')

Get the GO terms of every layer, together with their neuron numbers

all_terms_ids = {}    # layer number -> array of "<term>_<k>" labels, one per output neuron
all_terms_names = {}  # layer number -> array of "<Name> (<k>)" labels, same order
all_layers_non_virtual = {} # store only terms that are part of the layer (remove virtual), those are the important attributions
all_layers_non_virtual_names = {}
num_neurons_per_GO = 6


def _go_term_name(term):
    """Return the capitalised name of `term`: column position 1 of the first
    `gene_ontology` row whose 'id' equals `term`."""
    return gene_ontology.loc[gene_ontology['id'] == term].to_numpy()[0,1].capitalize()


def _expand_to_neurons(terms):
    """Return (ids, names) with one "<term>_<k>" / "<Name> (<k>)" entry per
    neuron k = 1..num_neurons_per_GO, for every term in `terms`, in order."""
    ids = []
    names = []
    for term in terms:
        name = _go_term_name(term)
        for i in range(1, num_neurons_per_GO+1):
            ids.append(term+"_"+str(i))
            names.append(name+" ("+str(i)+")")
    return ids, names


for layer_number in range(len(layer_connections)-1):
    layer_pairs = layer_connections[layer_number]

    # create_index comes from util; its keys are taken as the terms of this
    # layer's output neurons (presumably in neuron order — confirm in util).
    output_id = create_index(layer_pairs[:,0])

    terms_ids, names = _expand_to_neurons(output_id.keys())
    all_terms_ids[layer_number] = np.array(terms_ids)
    all_terms_names[layer_number] = np.array(names)

    # Terms that belong to this layer. The neuron count now follows
    # num_neurons_per_GO instead of a hard-coded 6, so both label sets agree.
    non_virtual, non_virtual_names = _expand_to_neurons(level_list[layer_number+1])
    all_layers_non_virtual[layer_number] = non_virtual
    all_layers_non_virtual_names[layer_number] = non_virtual_names

All GO terms that are part of a layer (non-virtual), with their corresponding name and layer number…

# One frame per layer, concatenated once at the end. Growing the result from an
# empty seed frame inside the loop is quadratic, and in the original run the
# integer layer numbers were displayed as floats (0.0).
layer_frames = []
for layer_number in range(len(layer_connections)-1):
    layer_go_info = pd.DataFrame({"GO_term":all_layers_non_virtual[layer_number],"Name":all_layers_non_virtual_names[layer_number],"layer_number":(layer_number)})
    layer_frames.append(layer_go_info)

if layer_frames:
    # Row labels restart at 0 for every layer, exactly as before (no ignore_index).
    real_go_info = pd.concat(layer_frames)
else:
    # No layers: keep the empty three-column frame the loop used to start from.
    real_go_info = pd.DataFrame({"GO_term":[],"Name":[],"layer_number":[]})
real_go_info.head()
GO_term Name layer_number
0 GO:0000019_1 Regulation of mitotic recombination (1) 0.0
1 GO:0000019_2 Regulation of mitotic recombination (2) 0.0
2 GO:0000019_3 Regulation of mitotic recombination (3) 0.0
3 GO:0000019_4 Regulation of mitotic recombination (4) 0.0
4 GO:0000019_5 Regulation of mitotic recombination (5) 0.0

Drugs info

def get_compound_names(file_name):
    """Read a tab-separated file and return ``[field 1, field 2]`` for every line.

    Each line is stripped and split on tabs; the second and third fields
    (indices 1 and 2) are kept as a two-element list. No line is skipped, so
    a header line, if present, is returned as the first entry.
    """
    with open(file_name, 'r') as handle:
        split_lines = (raw.strip().split('\t') for raw in handle)
        return [[fields[1], fields[2]] for fields in split_lines]
# Each entry is [field 1, field 2] of a line of compound_names.txt; later code
# uses entry[0] as the key into drug2id_mapping and entry[1] as the column name.
drugs = get_compound_names(inputdir+"compound_names.txt")
drugs.pop(0) # drop the first line (the header row)
['SMILE', 'Name']

DeepLIFT for VNN

Reference activation… (baseline)

# Reference (baseline) inputs for DeepLIFT.
median_cell_features = np.median(cell_features,axis=0) # per-column median over all rows of cell_features
# NOTE: despite its name this is not a median; it is read from the glucose
# fingerprint file and used as the reference drug input.
median_drug_features = np.genfromtxt(computer+"SparseGO_code/data/glucose_fingerprint.txt", delimiter=',')

Attribution function: sum

def get_layer_attribution(layer_number,input_data,baseline,selected_drug_data):
    """Return the summed DeepLIFT attributions of one layer for one drug.

    Runs ``LayerDeepLift`` on ``model_layers[layer_number]`` with
    ``input_data`` against ``baseline``, sums the attributions over axis 0
    (the samples) and labels them with ``all_terms_ids[layer_number]``.

    Returns a DataFrame with a ``GO_term`` column and one numeric column named
    ``selected_drug_data[1]`` (rounded to 10 decimals), restricted to the rows
    listed in ``all_layers_non_virtual[layer_number]``.
    """
    drug_column = selected_drug_data[1]

    explainer = LayerDeepLift(model, model_layers[layer_number], multiply_by_inputs=True)
    per_sample = explainer.attribute(input_data, baseline)
    summed = per_sample.cpu().detach().numpy().sum(0)  # add up the attributions of all samples

    # column_stack yields a string matrix, so the numeric column is converted back below
    table = pd.DataFrame(
        np.column_stack((all_terms_ids[layer_number], summed)),
        columns=["GO_term", drug_column],
    )
    table[[drug_column]] = table[[drug_column]].apply(pd.to_numeric).round(10)

    # keep only the non-virtual terms of this layer
    in_layer = table['GO_term'].isin(all_layers_non_virtual[layer_number])
    return table.loc[in_layer]

DeepLIFT for all drugs

# DeepLIFT attributions of every analysed layer, for every drug.
#
# The baseline (median cell features + reference drug features) does not depend
# on the drug, so it is built once instead of once per drug.
baseline = torch.FloatTensor(np.concatenate((median_cell_features, median_drug_features), axis=None))
baseline = torch.reshape(baseline, (1, baseline.size()[0]))
baseline = baseline.cuda(0)  # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors suffice

attribution_columns = []  # one Series per drug, concatenated once after the loop
go_term_column = None     # GO_term labels, taken from the first drug processed

for selected_drug_data in drugs:
    selected_drug = selected_drug_data[0]  # drug SMILES, key into drug2id_mapping
    drug_specific_features = drug_features[drug2id_mapping[selected_drug]]  # features of drug

    # pair the selected drug with every cell line: one row = cell features + drug features
    selected_drug_features = [
        np.concatenate((cell_features[i], drug_specific_features), axis=None)
        for i in range(len(cell2id_mapping))
    ]
    selected_drug_features = torch.FloatTensor(np.array(selected_drug_features))
    input_data = selected_drug_features.cuda(0)

    # attribution of each analysed layer, stacked vertically
    attribution_data_drug = pd.concat([
        get_layer_attribution(layer_index, input_data, baseline, selected_drug_data)
        for layer_index in range(len(model_layers))
    ])

    if go_term_column is None:
        go_term_column = attribution_data_drug.iloc[:,0]
    attribution_columns.append(attribution_data_drug.iloc[:,1])

    print(selected_drug_data[1])

if go_term_column is None:
    # Previously this surfaced as a NameError on `attribution_data_drug`.
    raise ValueError("`drugs` is empty: no attributions were computed")

# Single column-wise concat instead of growing the frame inside the loop.
attribution_data_all = pd.concat([go_term_column] + attribution_columns, axis=1)
attribution_data_all = attribution_data_all.set_index("GO_term")
attribution_data_all.head()
BRD-K02251932-001-01-3 BRD-K25737009-001-01-2 Nintedanib bicalutamide N-[(2R,3S)-2-[[cyclopropylmethyl(methyl)amino]methyl]-5-[(2R)-1-hydroxypropan-2-yl]-3-methyl-6-oxo-3,4-dihydro-2H-1,5-benzoxazocin-8-yl]-1-methyl-4-imidazolesulfonamide PHA-665752 N-cyclopropyl-3-[3-[[cyclopropyl(oxo)methyl]amino]-1H-indazol-6-yl]benzamide Ki8751 IPA-3 FAWUGYGEBHAQBU-PPEXNQRJSA-N ... ML031 Semagacestat RITA CDK9 inhibitor Dasatinib BMS-536924;CC1=CC(=CC2=C1NC(=C3C(=CC=NC3=O)NC[C@H](C4=CC(=CC=C4)Cl)O)N2)N5CCOCC5 SCHEMBL13741284 Daporinad STF-31 Narciclasine
GO_term
GO:0000012_1 -0.006564 -0.005680 0.003188 -0.005863 -0.003410 -0.002951 0.001118 0.002033 0.000799 -0.007842 ... -0.007256 -0.004271 -0.013783 -0.006253 0.002864 0.009604 -0.008099 -0.001475 -0.003698 -0.009866
GO:0000012_2 0.010029 0.011514 0.009892 0.012072 0.005788 0.012909 0.002316 0.009362 -0.011816 0.000166 ... 0.008918 -0.002449 0.017704 0.006732 0.002447 0.006485 0.003888 -0.000569 0.001628 0.017132
GO:0000012_3 0.008466 0.006840 -0.000027 0.006379 0.003082 -0.006110 -0.008877 -0.000347 -0.013084 0.000150 ... -0.006096 0.011308 0.012216 0.000997 0.011521 0.013800 0.002843 0.016328 0.021640 0.003536
GO:0000012_4 0.013018 0.007276 0.010128 0.008622 0.004795 0.006706 0.000874 0.005514 -0.003347 -0.000010 ... -0.003682 0.006544 0.010806 0.003346 0.017556 0.023130 0.001105 0.009710 0.016940 0.014787
GO:0000012_5 -0.007076 -0.006129 -0.007634 -0.003785 -0.004151 -0.007947 -0.008430 -0.006039 -0.002722 0.002163 ... 0.001821 -0.002346 -0.007831 -0.009368 -0.011118 -0.003408 -0.001760 0.003593 -0.000109 -0.020831

5 rows × 684 columns

ChEMBL Drug Target Slim

from chembl_webresource_client.new_client import new_client

Import SparseGO drugs

# Get names 
def get_compound_names(file_name):
    """Return the lower-cased third tab-separated field of every line.

    NOTE: this redefines the earlier ``get_compound_names``, which returned
    ``[field 1, field 2]`` pairs; from here on the name refers to this version.
    """
    with open(file_name, 'r') as handle:
        return [raw.strip().split('\t')[2].lower() for raw in handle]
# Lower-cased drug names, one per line of the file.
# NOTE(review): this reads SparseGO_code/data/compound_names.txt, whereas `drugs`
# was read from inputdir+"compound_names.txt" — confirm both files list the same
# drugs in the same order.
names = get_compound_names(computer+"SparseGO_code/data/compound_names.txt")
names.pop(0) # drop the first line (the header row)

ChEMBL IDs

Get chembl IDs of drugs if available (there are always 684 drugs, the compounds2ids object can be reused)

# Get all ChEMBL IDs -- slow: every drug triggers at least one web query.
molecule = new_client.molecule

# drug name -> ChEMBL id. For "a + b" combinations the value is a LIST of two
# ids when both parts are found, and a single id string when only one is found.
compounds2ids = {}
for i,drug in enumerate(names):
    
    if " + " in drug:
        # Drug combination: look each part up by preferred name only.
        drug_split = drug.split(" + ", 1)
        ID1 = list(molecule.filter(pref_name__iexact=drug_split[0]).only('molecule_chembl_id'))
        ID2 = list(molecule.filter(pref_name__iexact=drug_split[1]).only('molecule_chembl_id'))
        if len(ID1)>0 and len(ID2)>0:
            compounds2ids[drug]=[ID1[0]['molecule_chembl_id'],ID2[0]['molecule_chembl_id']]
        elif len(ID1)>0:
            compounds2ids[drug]=ID1[0]['molecule_chembl_id'] 
        elif len(ID2)>0:
            compounds2ids[drug]=ID2[0]['molecule_chembl_id'] 
        else:
            print(drug,i) # not found: report the name and its position

    else:
        # Single drug: try the preferred name, then the name as a ChEMBL id,
        # then the synonyms; the first hit of the first successful query wins.
        ID = list(molecule.filter(pref_name__iexact=drug).only('molecule_chembl_id'))
        if len(ID)>0:
            ID = ID[0]['molecule_chembl_id']
            compounds2ids[drug]=ID
        else:
            # for drugs that have the ChEMBL ID as their name
            # NOTE(review): names were lower-cased when read, so this relies on
            # the service matching lower-case ids — confirm.
            ID = list(molecule.filter(chembl_id=drug).only('molecule_chembl_id')) 
            if len(ID)>0:
                ID = ID[0]['molecule_chembl_id']
                compounds2ids[drug]=ID
            else:
                # in case it is not found by pref_name: fall back to synonyms
                ID = list(molecule.filter(molecule_synonyms__molecule_synonym__iexact=drug).only('molecule_chembl_id'))
                if len(ID)>0:
                    ID = ID[0]['molecule_chembl_id']
                    compounds2ids[drug]=ID
                else:
                    print(drug,i) # not found: report the name and its position
    # 341 ChEMBL IDs were found (October 31, 2022)
#manually add 6 more
compounds2ids["teniposide [usan]"]="CHEMBL452231"
compounds2ids["docetaxel (taxotere)"]="CHEMBL92"
compounds2ids["nan + navitoclax(1)"]="CHEMBL443684"
compounds2ids["nan + navitoclax(2)"]="CHEMBL443684"
compounds2ids["osi-027;coc1=cc=cc2=cc(=c3c4=c(n=cnn4c(=n3)c5ccc(cc5)c(=o)o)n)n=c21"]="CHEMBL3120215"
compounds2ids["paclitaxel;cc1=c2[c@h](c(=o)[c@@]3([c@h](c[c@@h]4[c@]([c@h]3[c@@h]([c@@](c2(c)c)(c[c@@h]1oc(=o)[c@@h]([c@h](c5=cc=cc=c5)nc(=o)c6=cc=cc=c6)o)o)oc(=o)c7=cc=cc=c7)(co4)oc(=o)c)o)c)oc(=o)c"]="CHEMBL428647"
len(compounds2ids)
347

ChEMBL MoA (targets)

Get the molecule targets of each drug (if available)

# drug name -> set of target ChEMBL ids, filled from the ChEMBL mechanism records
compounds2targets = {drug: set() for drug in compounds2ids}

chembl_ids = list(compounds2ids.values()) # ChEMBL ids of the drugs

for drug, drug_ids in compounds2ids.items():
    # go from the compound to its targets through its mechanism records
    activities = new_client.mechanism.filter(parent_molecule_chembl_id__in=drug_ids).only(
        ['parent_molecule_chembl_id', 'target_chembl_id'])
    drug_targets = compounds2targets[drug]
    for act in activities:
        drug_targets.add(act['target_chembl_id'])
    print(drug)

# Keep only the drugs with at least one target id that is not None
# (a None entry, if present, stays inside the kept sets).
compounds2targets = {
    drug: found_targets
    for drug, found_targets in compounds2targets.items()
    if any(target_id is not None for target_id in found_targets)
}
len(compounds2targets)
220

Drug slim GO terms

Get the GO terms of each target

# Get the GO terms of each target.
# compounds_GOterms maps a drug name to a de-duplicated DataFrame holding the
# cross-reference fields of every component of every target, plus the target id
# as the last column.
compounds_GOterms = {}
for compound, compound_targets in compounds2targets.items():
    GOterms_list = []

    for target in compound_targets:
        all_cross_references = list(new_client.target.filter(target_chembl_id=target).only(['target_components']).only(['target_components_xrefs']))[0]['target_components']
        # not all targets have annotated components; an empty list is simply skipped
        for component in all_cross_references:
            GOterms = pd.DataFrame(component['target_component_xrefs'])
            GOterms = pd.concat([GOterms,pd.Series([target]).repeat(len(GOterms)).reset_index().pop(0)],axis=1) # add target ID to dataframe
            # extend() appends in place instead of copying the whole list each time
            GOterms_list.extend(GOterms.values.tolist())

    compounds_GOterms[compound] = pd.DataFrame(GOterms_list).drop_duplicates()
    print(compound)
len(compounds_GOterms)
220
# NOTE: the drug counts quoted in this cell's comments come from earlier runs
# and may not match the current outputs.
# we have 206 annotated drugs on ChEMBL
# add GO terms found in CTRPv2 (spreadsheet read from the current working directory;
# the 'Drug' and 'Field' columns are used)
CTRPv2_terms = pd.read_excel('ctrp_goterms_drugs.xlsx')  
# add GO terms of drugs with or without annotations
for drug in CTRPv2_terms["Drug"].unique():
    if drug not in list(compounds_GOterms.keys()): # some drugs had no previous data, no annotations from ChEMBL
        compounds_GOterms[drug] = pd.DataFrame() # create empty dataframe

    # Append one row per term: [term, "", "GoProcess", ""]. Column 2 is the
    # field later compared with "GoProcess"; column 0 is read as the GO term.
    for term in list(CTRPv2_terms.loc[CTRPv2_terms["Drug"]==drug]["Field"]):
        compounds_GOterms[drug] = pd.concat([compounds_GOterms[drug],pd.DataFrame([term,"","GoProcess",""]).transpose()])
    compounds_GOterms[drug] = compounds_GOterms[drug].drop_duplicates() 

# now we have 233 annotated drugs
# Delete drugs with no GO terms (some targets have no annotated GO terms)
compounds_GOterms = {k: v for k, v in compounds_GOterms.items() if len(v) != 0 } 
len(compounds_GOterms)
236

Match GO terms

Find all matching terms, i.e. the terms that are part of both the SparseGO graph and the drug slim results…

def load_ontology_extra_output(ontology_file, gene2id_mapping):
    """
    Creates the directed graph of the GO terms and stores the connected elements in arrays.

    Every non-blank line of ``ontology_file`` is split on whitespace and must
    have at least three fields. When the third field is ``default`` the first
    two fields are added as a directed edge (first -> second); otherwise the
    line is read as a (term, gene) pair, which is skipped (and the gene
    printed) when the gene is not a key of ``gene2id_mapping``.

    The process is terminated with ``sys.exit(1)`` when a term has no genes
    (directly or through its descendants), when the graph has more than one
    root, or when it has more than one connected component.

        Parameters
        ----------
        ontology_file: str
            Path of the ontology text file

        gene2id_mapping: dict
            Gene name -> gene index

        Output
        ------
        dG: networkx.classes.digraph.DiGraph
            Directed graph of all terms

        terms_pairs: numpy.ndarray
            Store the connection between a term and a term

        genes_terms_pairs: numpy.ndarray
            Store the connection between a gene and a term

        term_direct_gene_map: dict
            Term -> set of indices of the genes annotated directly to it

        term_size_map: dict
            Term -> number of genes of the term, counting those of its descendants
    """

    dG = nx.DiGraph() # Directed graph class

    terms_pairs = [] # store the pairs between a term and a term
    genes_terms_pairs = [] # store the pairs between a gene and a term

    gene_set = set() # every gene that appears in a kept (term, gene) pair
    term_direct_gene_map = {}
    term_size_map = {}

    # `with` guarantees the file is closed even if a line fails to parse
    with open(ontology_file) as file_handle:
        for line in file_handle:

            line = line.rstrip().split() # strip and split into fields; three are expected
            if not line: # tolerate blank lines instead of failing on line[2]
                continue

            if line[2] == 'default': # term-term relation: connect both terms in the graph
                dG.add_edge(line[0], line[1])
                terms_pairs.append([line[0], line[1]])
                continue

            if line[1] not in gene2id_mapping: # skip genes that are not part of gene2id_mapping
                print(line[1])
                continue

            genes_terms_pairs.append([line[0], line[1]])
            # register the gene index under its term, creating the set on first use
            term_direct_gene_map.setdefault(line[0], set()).add(gene2id_mapping[line[1]])
            gene_set.add(line[1])

    terms_pairs = np.array(terms_pairs) # convert to 2d array
    genes_terms_pairs = np.array(genes_terms_pairs) # convert to 2d array

    print('There are', len(gene_set), 'genes')

    for term in dG.nodes():

        term_gene_set = set()

        if term in term_direct_gene_map:
            term_gene_set = term_direct_gene_map[term] # genes annotated directly to the term

        deslist = nxadag.descendants(dG, term) # every descendant term of `term`

        for child in deslist:
            if child in term_direct_gene_map:
                # `|` builds a new set, so the sets stored in term_direct_gene_map are not modified
                term_gene_set = term_gene_set | term_direct_gene_map[child]

        if len(term_gene_set) == 0:
            print('There is empty terms, please delete term:', term)
            sys.exit(1)
        else:
            term_size_map[term] = len(term_gene_set) # gene count including descendants

    # nodes without incoming edges are the roots of the ontology
    roots = [n for n in dG.nodes if dG.in_degree(n) == 0]

    uG = dG.to_undirected() # undirected copy, used to count connected components
    connected_subG_list = list(nxacc.connected_components(uG))

    # Verify the graph makes sense...
    print('There are', len(roots), 'roots:', roots[0])
    print('There are', len(dG.nodes()), 'terms')
    print('There are', len(connected_subG_list), 'connected components')
    if len(roots) > 1:
        print('There are more than 1 root of ontology. Please use only one root.')
        sys.exit(1)
    if len(connected_subG_list) > 1:
        print( 'There are more than connected components. Please connect them.')
        sys.exit(1)

    return dG, terms_pairs, genes_terms_pairs, term_direct_gene_map, term_size_map

SparseGO graph

# Import SparseGO graph (to extract all nodes/terms)... 

# Load ontology: create the graph of connected GO terms.
# NOTE: this rebinds dG, terms_pairs and genes_terms_pairs, which were already
# loaded above with util.load_ontology from the same file.
dG, terms_pairs, genes_terms_pairs, term_direct_gene_map, term_size_map = load_ontology_extra_output(onto, gene2id_mapping)
####
sparseGO_terms = list(dG.nodes())
# Drop the root term (hard-coded id); raises ValueError if it is not a node of the graph.
sparseGO_terms.remove("GO:0008150")
There are 15015 genes
There are 1 roots: GO:0008150
There are 4184 terms
There are 1 connected components

Full GO graph

# Import full graph (to find parents)...
import obonet
#import networkx as nx
# NOTE(review): the ontology is downloaded on every run and the URL is not
# version-pinned, so results may change between runs — consider a local copy.
url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
full_graph = obonet.read_obo(url)
full_graph = full_graph.reverse() # reverse every edge, so that in_edges(term) lists the parents of a term
[n for n in full_graph.nodes if full_graph.in_degree(n) == 0] # nodes without incoming edges: the 3 roots (BP,MF,CC)
['GO:0003674', 'GO:0005575', 'GO:0008150']

Match terms!

Find all matching terms, i.e. the terms that are part of both the SparseGO graph and the drug slim results. If an ancestor of a slim term is in the SparseGO graph, that ancestor is added as a match too.

# Each model has DIFFERENT matches (the graph is different)
#
# For every drug, collect [relationship, term] pairs where `term` belongs to the
# SparseGO graph: relationship 1 = the drug slim term itself, 2 = one of its
# ancestors in the full GO graph.
#
# FIX: the ancestor walk used to replace `parents` with the grandparents before
# checking anything, so the direct parents of a slim term were never matched.
# Every level is now checked before moving up; match counts may therefore be
# higher than in earlier runs.
sparseGO_terms_set = set(sparseGO_terms) # O(1) membership instead of scanning the list

compounds_GOterms_matches = {}
for drug in compounds_GOterms.keys():
    drug_df = compounds_GOterms[drug]
    drug_slim_GOterms = set(drug_df.loc[drug_df[2] == "GoProcess"][0]) # only GO processes
    drug_matches = [] # direct matches plus matches of any ancestor

    for term in drug_slim_GOterms:

        if term in sparseGO_terms_set: # is the term itself in the SparseGO graph?
            drug_matches.append([1,term])

        # are its ancestors in the SparseGO graph?
        relationship = 2
        visited = set() # ancestors already checked, so shared ancestors are walked once
        parents = [source for source, _ in full_graph.in_edges(term)] # parents of term
        while len(parents)>0:
            for parent_term in parents:
                if parent_term in sparseGO_terms_set:
                    drug_matches.append([relationship, parent_term])
            visited.update(parents)

            # move one level up; dict.fromkeys de-duplicates while keeping order
            next_level = dict.fromkeys(source for source, _ in full_graph.in_edges(parents))
            parents = [candidate for candidate in next_level if candidate not in visited]

    # remove duplicated pairs once per drug, keeping the first occurrence
    drug_matches = [list(pair) for pair in dict.fromkeys(tuple(match) for match in drug_matches)]
    compounds_GOterms_matches[drug] = drug_matches
    print(drug)
# delete drugs that have no matches
compounds_GOterms_matches = {i:j for i,j in compounds_GOterms_matches.items() if j != []}
len(compounds_GOterms_matches)
230

SparseGO terms x drugSlim terms matrix

# Lower-case the drug names used as column labels so they match the lower-cased
# names used as keys of compounds_GOterms_matches.
# NOTE(review): names that differ only in case would become duplicate columns — confirm none exist.
attribution_data_all.columns = attribution_data_all.columns.str.lower() # in order to match the term
attribution_data_all.head()
brd-k02251932-001-01-3 brd-k25737009-001-01-2 nintedanib bicalutamide n-[(2r,3s)-2-[[cyclopropylmethyl(methyl)amino]methyl]-5-[(2r)-1-hydroxypropan-2-yl]-3-methyl-6-oxo-3,4-dihydro-2h-1,5-benzoxazocin-8-yl]-1-methyl-4-imidazolesulfonamide pha-665752 n-cyclopropyl-3-[3-[[cyclopropyl(oxo)methyl]amino]-1h-indazol-6-yl]benzamide ki8751 ipa-3 fawugygebhaqbu-ppexnqrjsa-n ... ml031 semagacestat rita cdk9 inhibitor dasatinib bms-536924;cc1=cc(=cc2=c1nc(=c3c(=cc=nc3=o)nc[c@h](c4=cc(=cc=c4)cl)o)n2)n5ccocc5 schembl13741284 daporinad stf-31 narciclasine
GO_term
GO:0000012_1 -0.006564 -0.005680 0.003188 -0.005863 -0.003410 -0.002951 0.001118 0.002033 0.000799 -0.007842 ... -0.007256 -0.004271 -0.013783 -0.006253 0.002864 0.009604 -0.008099 -0.001475 -0.003698 -0.009866
GO:0000012_2 0.010029 0.011514 0.009892 0.012072 0.005788 0.012909 0.002316 0.009362 -0.011816 0.000166 ... 0.008918 -0.002449 0.017704 0.006732 0.002447 0.006485 0.003888 -0.000569 0.001628 0.017132
GO:0000012_3 0.008466 0.006840 -0.000027 0.006379 0.003082 -0.006110 -0.008877 -0.000347 -0.013084 0.000150 ... -0.006096 0.011308 0.012216 0.000997 0.011521 0.013800 0.002843 0.016328 0.021640 0.003536
GO:0000012_4 0.013018 0.007276 0.010128 0.008622 0.004795 0.006706 0.000874 0.005514 -0.003347 -0.000010 ... -0.003682 0.006544 0.010806 0.003346 0.017556 0.023130 0.001105 0.009710 0.016940 0.014787
GO:0000012_5 -0.007076 -0.006129 -0.007634 -0.003785 -0.004151 -0.007947 -0.008430 -0.006039 -0.002722 0.002163 ... 0.001821 -0.002346 -0.007831 -0.009368 -0.011118 -0.003408 -0.001760 0.003593 -0.000109 -0.020831

5 rows × 684 columns

attribution_data_all.shape
(25098, 684)

Only keep drugs that have annotated GO terms

# Keep only the columns of the drugs that have at least one matched GO term;
# raises KeyError if a matched drug name is not a column of attribution_data_all.
attribution_data_annotated = attribution_data_all[list(compounds_GOterms_matches.keys())]
attribution_data_annotated.shape # 230 DRUGS
(25098, 230)

Build drugSlim (MoA) matrix

# Drug-slim (MoA) matrix: same rows (GO-term neurons) and columns (drugs) as
# attribution_data_annotated, holding 1 where the term is annotated to the drug
# and 0 elsewhere.
#
# Built directly as a zero frame and filled with .loc: the previous version
# zeroed a copy through `.values[:]` and then used chained assignment
# (`slim_matrix[drug][rows] = 1`), neither of which pandas guarantees to write
# into the frame (with Copy-on-Write enabled they do not).
slim_matrix = pd.DataFrame(
    0.0,
    index=attribution_data_annotated.index,
    columns=attribution_data_annotated.columns,
)
for drug in compounds_GOterms_matches.keys():
    drug_matches = compounds_GOterms_matches[drug] # list of [relationship, term]
    drug_matches_names = [match[1] for match in drug_matches]

    # one row label per neuron of every matched term: "<term>_1" ... "<term>_<n>"
    drug_matches_names_duplicated = [
        term+"_"+str(i)
        for term in set(drug_matches_names)
        for i in range(1, num_neurons_per_GO+1)
    ]

    slim_matrix.loc[drug_matches_names_duplicated, drug] = 1 # add a 1 if term is annotated to drug

SVM

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import svm
# Result matrices (rows: GO terms, columns: drugs) filled by the
# cross-validation loop.
# Labels and hard predictions are integral (0/1), so an integer fill is fine.
slim_matrix_single_neuron = pd.DataFrame(0, index=sparseGO_terms, columns=slim_matrix.columns)
preds_svm_matrix = pd.DataFrame(0, index=sparseGO_terms, columns=slim_matrix.columns)
# Scores are real-valued: allocate them as float so that writing probabilities,
# distances or logits does not rely on implicit int -> float upcasting, which
# pandas has deprecated (FutureWarning today, an error in later releases).
platt_matrix = pd.DataFrame(0.0, index=sparseGO_terms, columns=slim_matrix.columns)
distance_matrix = pd.DataFrame(0.0, index=sparseGO_terms, columns=slim_matrix.columns)
delta_logits_matrix = pd.DataFrame(0.0, index=sparseGO_terms, columns=slim_matrix.columns)

Create models

SVM models (one classifier per GO term)…

# Dictionaries to store the cross-validated results of each GO term
GO_terms_auc_svm = {}
GO_terms_aupr_svm = {}
GO_terms_precision_svm = {}
GO_terms_auc_delta_logits = {}

# Train and evaluate one RBF-kernel SVM per GO term with 4-fold stratified
# cross-validation over the annotated drugs.
for goterm in sparseGO_terms:
    # Annotation vector of the term (1 = drug annotated). Only the "_1" row is
    # read; the slim matrix sets all six rows of a term identically.
    goterm_drugs = slim_matrix.loc[[goterm+"_"+str(1)]].values.flatten()

    # Skip terms with 8 or fewer annotated drugs.
    # NOTE(review): the original comment said "at least 2 annotated drugs in
    # each group", which 8 drugs over 4 folds would satisfy, yet `<= 8` also
    # skips terms with exactly 8. Threshold kept unchanged - confirm intent.
    if sum(goterm_drugs) <= 8:
        continue

    # The six rows (nodes) belonging to this GO term.
    list_nodes = [goterm+"_"+str(i) for i in range(1,7)]

    # Features: one row per drug, one column per node of the term.
    score = attribution_data_annotated.loc[list_nodes].T
    # Scale every column by its standard deviation; NaNs produced by the
    # division are replaced by 0. This scaling strongly affects the results.
    # NOTE(review): the std is computed over all drugs, test folds included.
    score_mod = score.divide(score.std()).fillna(0)

    # Separate drugs in 4 groups for cross-validation -----

    # Split data in 2 groups (stratified, so both classes are in both groups)
    X_part1,X_part2,y_part1,y_part2=train_test_split(score_mod,goterm_drugs,test_size=0.50,random_state=0,stratify=goterm_drugs)
    # Split each half again to obtain the 4 groups
    X_group1,X_group2,y_group1,y_group2=train_test_split(X_part1,y_part1,test_size=0.50,random_state=0,stratify=y_part1)
    X_group3,X_group4,y_group3,y_group4=train_test_split(X_part2,y_part2,test_size=0.50,random_state=0,stratify=y_part2)

    # Keep the folds in lists instead of looking them up by name through
    # globals(), which only works when this code runs at module level.
    X_groups = [X_group1, X_group2, X_group3, X_group4]
    y_groups = [y_group1, y_group2, y_group3, y_group4]

    # Out-of-fold results, one entry per fold
    all_y_test = []
    all_y_pred_proba = []
    all_y_pred_proba_dis = []
    all_y_pred = []
    all_y_names = []

    for fold in range(len(X_groups)):
        X_test = X_groups[fold]
        y_test = y_groups[fold]

        # Use the other 3 groups (in ascending order) for training
        X_train = pd.concat([X for j, X in enumerate(X_groups) if j != fold])
        y_train = np.concatenate([y for j, y in enumerate(y_groups) if j != fold])

        gamma = "scale"
        C=1

        svm_model = svm.SVC(C=C,gamma=gamma, kernel='rbf',
                            class_weight="balanced",
                            tol=0.001,
                            probability=True,
                            random_state=1234)

        # fit the model with data
        svm_model.fit(X_train,y_train)
        y_pred=svm_model.predict(X_test)
        y_pred_proba = svm_model.predict_proba(X_test)[::,1] # platt values
        # Signed decision value of every test sample (binary problem), returned
        # in the same row order as X_test.
        y_pred_proba_dis = svm_model.decision_function(X_test)

        all_y_test.append(y_test)
        all_y_pred_proba.append(y_pred_proba)
        all_y_pred_proba_dis.append(y_pred_proba_dis)
        all_y_pred.append(y_pred)
        all_y_names.append(X_test.index)

    all_y_test = np.concatenate(all_y_test)
    all_y_pred_proba = np.concatenate(all_y_pred_proba)
    all_y_pred_proba_dis = np.concatenate(all_y_pred_proba_dis)
    all_y_names = np.concatenate(all_y_names)
    all_y_pred = np.concatenate(all_y_pred)

    # Delta logits: posterior log-odds (from the Platt probability) minus the
    # prior log-odds given by the fraction of annotated drugs.
    percentage_go_annotations = sum(all_y_test)/len(all_y_test)
    logits_apriori=np.log(percentage_go_annotations/(1-percentage_go_annotations))
    # Clip away exact 0/1 probabilities so the logit stays finite; values
    # strictly inside (0, 1) are left untouched.
    eps = np.finfo(float).eps
    proba_clipped = np.clip(all_y_pred_proba, eps, 1-eps)
    logits_apost= np.log(proba_clipped/(1-proba_clipped))
    delta_logits = logits_apost-logits_apriori

    # Store the out-of-fold values, aligned by drug name
    platt_matrix.loc[goterm,all_y_names] = all_y_pred_proba
    distance_matrix.loc[goterm,all_y_names] = all_y_pred_proba_dis
    slim_matrix_single_neuron.loc[goterm,all_y_names] = all_y_test
    preds_svm_matrix.loc[goterm,all_y_names] = all_y_pred

    delta_logits_matrix.loc[goterm,all_y_names] = delta_logits

    GO_terms_auc_delta_logits[goterm] = metrics.roc_auc_score(all_y_test, delta_logits)
    GO_terms_auc_svm[goterm] = metrics.roc_auc_score(all_y_test, all_y_pred_proba)

    precision, recall, thresholds = metrics.precision_recall_curve(all_y_test, all_y_pred_proba)
    GO_terms_aupr_svm[goterm] = metrics.auc(recall, precision)
    GO_terms_precision_svm[goterm] = metrics.precision_score(all_y_test, all_y_pred)
# done with platt values
# Gather the cross-validated AUC of every GO-term model in a one-column frame
# indexed by GO term, discarding undefined values, and show the best ones.
GO_terms_auc_svm_df = (
    pd.DataFrame(list(GO_terms_auc_svm.items()), columns=['goterm', 'auc'])
    .set_index("goterm")
    .dropna()
)
GO_terms_auc_svm_df.sort_values(by=["auc"], ascending=False).head()
auc
goterm
GO:0036289 0.999708
GO:0060440 0.994743
GO:0042149 0.971292
GO:1902455 0.969545
GO:0001556 0.965979
# Report how many GO terms ended up with a cross-validated model.
print(f"There are {len(GO_terms_auc_svm_df)} svm models.")
There are 939 svm models.
# Restrict every result matrix to the GO terms for which a model was trained
# (the rows of GO_terms_auc_svm_df).
platt_matrix = platt_matrix.loc[GO_terms_auc_svm_df.index.tolist()]
distance_matrix = distance_matrix.loc[GO_terms_auc_svm_df.index.tolist()]
slim_matrix_single_neuron = slim_matrix_single_neuron.loc[GO_terms_auc_svm_df.index.tolist()]
preds_svm_matrix = preds_svm_matrix.loc[GO_terms_auc_svm_df.index.tolist()]
delta_logits_matrix = delta_logits_matrix.loc[GO_terms_auc_svm_df.index.tolist()]

AUC histogram

# Histogram of the cross-validated AUC of all GO-term models; bins whose left
# edge lies above the threshold are highlighted and their share is printed.
sns.set(rc={'figure.figsize':(10,6)})
fig, ax = plt.subplots()
# NOTE(review): the threshold used here is > 0.69 while the legend reads
# "AUC>=0.7" - confirm which one is intended.
perc = str(round((100*len(GO_terms_auc_svm_df[GO_terms_auc_svm_df["auc"]>0.69])/len(GO_terms_auc_svm_df)),2))+"%"
# Plot the "auc" column explicitly: once the "levels"/"parents" columns are
# added to this frame, passing the whole DataFrame would histogram three
# datasets and `patches` would no longer be a flat container of bars.
N, bins, patches = plt.hist(GO_terms_auc_svm_df["auc"], color=CB_color_cycle[6],bins=50, linewidth=0.1)

# Highlight the bins above the threshold
for i in range(0,len(bins)-1):
    if bins[i]>0.69:
        patches[i].set_facecolor(CB_color_cycle[2])

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Hide the top, right and left spines; draw the bottom one in light gray.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

# Hide the bottom ticks, keep the left ones.
ax.tick_params(bottom=False, left=True)

# Add a light gray horizontal grid drawn below the bars (no vertical grid).
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)

plt.xlabel("AUC value", fontsize=20)
plt.ylabel("Number of GO term models", fontsize=20)
# Hand-made legend entry for the highlighted bins
colors2 = {'GO term models with AUC>=0.7':CB_color_cycle[2]}  
labels = list(colors2.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors2[label]) for label in labels]
plt.legend(handles, labels,fontsize=20, loc="lower left", bbox_to_anchor=(0.35,-0.35))
plt.text(0.71, 8, str(perc), fontsize=20,color='#333333')
plt.title("Overall performance of the models using expression", fontsize=24)
# Author note: works best with the plain sum of the attributions.
fig.tight_layout()
fig.savefig(resultsdir+'modelsAUCsvm.png', transparent=True)

AUC waterfall plot

# Waterfall plot: one bar per GO-term model, sorted by decreasing AUC.
GO_terms_auc_svm_df =GO_terms_auc_svm_df.sort_values(by=["auc"], ascending=False)
plt.rcParams['figure.figsize'] = (12, 9)
# Despite their names, `drugs` holds the GO term ids (the frame's index) and
# `rhos` holds the AUC values.
drugs = GO_terms_auc_svm_df.index
rhos = GO_terms_auc_svm_df["auc"]

# Percentage of models with AUC > 0.69 (printed on the figure below).
# NOTE(review): the bar colouring below uses >= 0.69, so a value of exactly
# 0.69 is coloured but not counted here.
percentage = round((sum(rhos>0.69)/len(rhos))*100,1)

fig, ax = plt.subplots()
#colors = ['#208EA3' if (x < 0.5) else '#A4C61A' for x in rhos ]
# Gray edge for bars below the threshold, blue otherwise.
colors = ['#C9C9C9' if (x < 0.69) else "#6492CA" for x in rhos ]
ax.bar(
    x=drugs,
    height=rhos,
    edgecolor=colors,
    linewidth=2
)
plt.xticks([])
plt.yticks(fontsize=28)


# Hide all four spines (figure borders), which are not needed for this
# bar chart.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
#ax.spines['bottom'].set_color('#DDDDDD')

# Remove the ticks as well.
ax.tick_params(bottom=False, left=False)

# No grid lines on either axis.
ax.set_axisbelow(False)
ax.yaxis.grid(False)
#ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)


# Add axis labels. `labelpad` controls the space between each label and
# its axis.
ax.set_xlabel('SVM models', labelpad=-30, color='#333333',fontsize=50)
ax.set_ylabel('AUC-ROC value', labelpad=15, color='#333333',fontsize=50)
ax.set_title('', color='#333333',
             weight='bold')

# NOTE(review): colors2/labels/handles are only consumed by the commented-out
# legend call below; the label text does not describe this plot and looks like
# a leftover - confirm before removing.
colors2 = {'High confidence drugs (r>0.5)':'#A4C61A'}  
labels = list(colors2.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors2[label]) for label in labels]
#plt.legend(handles, labels,fontsize=40, loc="lower left",bbox_to_anchor=(0, -0.215))
plt.text(77, 0.32, str(percentage)+"%", fontsize=60,color='#000000')

plt.ylim((-0.1,1.1))
# Make the chart fill out the figure better.
fig.tight_layout()
fig.savefig(resultsdir+'WaterfallModelsSVM.png', transparent=True)

AUC boxplot by parents

# Add, for every modelled GO term, its hierarchy level (level_number - 1) and
# its number of incoming edges in dG (stored as "parents").
number_parents = pd.DataFrame.from_dict(
    {term: len(list(dG.in_edges(term))) for term in GO_terms_auc_svm_df.index},
    orient='index',
)
levels = pd.DataFrame.from_dict(
    {term: level_number[term]-1 for term in GO_terms_auc_svm_df.index},
    orient='index',
)

# Concatenate onto the "auc" column only, so that re-running this cell rebuilds
# the two extra columns instead of stacking duplicates (which made the column
# renaming below fail with a length mismatch).
GO_terms_auc_svm_df = pd.concat([GO_terms_auc_svm_df[["auc"]], levels,number_parents], axis=1)
GO_terms_auc_svm_df.columns = ["auc","levels","parents"]
GO_terms_auc_svm_df.head()
auc levels parents
GO:0000077 0.284021 1 3
GO:0045737 0.835954 0 8
GO:0000082 0.732331 2 4
GO:1900087 0.593301 0 10
GO:2000134 0.865329 1 9
import plotly.express as px

# Box plot of the cross-validated AUC grouped by level of the GO hierarchy.
c = ['#E8384F', '#FD817D', '#FDAE33',
         '#EECC16', '#A4C61A', '#37A862',"#208EA3","#3B6EAB"]

# NOTE(review): `df` (plotly's sample "tips" dataset) is not used by this
# plot; kept in case later cells rely on it - confirm and remove.
df = px.data.tips()
fig = px.box(GO_terms_auc_svm_df, x="levels", y="auc",
             color="levels",
            color_discrete_sequence=c,
             width =600,
             height=400,
              template="simple_white",
              labels=dict(levels="Level of GO hierarchy", auc="AUC-ROC")
            )
fig.update_traces(width=0.9)

fig.add_shape( # add a horizontal "target" line at AUC = 0.7
    type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
    x0=0, x1=1, xref="paper", y0=0.7, y1=0.7, yref="y"
)


fig.update_layout(
   # The bold tag is now closed properly (it was "<b> ... <b>").
   title=dict(text="<b> AUC value grouped by level of GO hierarchy </b>",
             x=0.5,
             y=0.9,
              font=dict(size=18),
              xanchor='center',
              yanchor='top'),
    # The x tick labels are hidden; the levels are identified by the legend.
    xaxis=dict(ticks="", showticklabels=False, showgrid=False, zeroline=False),
    yaxis=dict(ticks="", showticklabels=True, showgrid=True, zeroline=False),
 #   yaxis_range=[min(yy.flatten()),max(yy.flatten())],
  #  xaxis_range=[min(xx.flatten()),max(xx.flatten())],
    legend=dict(x=1.1, y=1, orientation="v",font=dict(size=11)),
    paper_bgcolor='rgba(0,0,0,0)',
    font=dict(family='Roboto',color= "#36382E",size=15)
    )

fig.show()

TOP 15 PREDICTED GO TERMS

# Ids of the 15 GO terms with the highest cross-validated AUC.
top15goterms = np.array(
    GO_terms_auc_svm_df.sort_values(by=["auc"], ascending=False).index[:15]
)

Get Top GO term names

# Attach the GO term descriptions to the 15 best models.
# real_go_info is matched through the "_1" node id of each term.
top15goterms_1 = [goterm+"_"+str(1) for goterm in top15goterms]
# .copy() gives an independent frame, so the GO_term edit below does not act
# on a slice of real_go_info (no SettingWithCopyWarning).
real_go_info_mod_best = real_go_info[real_go_info.GO_term.isin(top15goterms_1)].copy()
# Strip only the trailing "_1" node suffix (anchored, so nothing else in the
# identifier can be altered).
real_go_info_mod_best["GO_term"] = real_go_info_mod_best["GO_term"].str.replace(r"_1$", "", regex=True)
top15goterms_auc = GO_terms_auc_svm_df.sort_values(by=["auc"], ascending=False)[0:15].reset_index()
top15goterms_auc.columns=["GO_term","auc","levels","parents"]
top15goterms_auc = top15goterms_auc.merge(real_go_info_mod_best[real_go_info_mod_best["GO_term"].isin(top15goterms)], on="GO_term")
top15goterms_auc
GO_term auc levels parents Name layer_number
0 GO:0036289 0.999708 0 2 Peptidyl-serine autophosphorylation (1) 0.0
1 GO:0060440 0.994743 0 4 Trachea formation (1) 0.0
2 GO:0042149 0.971292 0 1 Cellular response to glucose starvation (1) 0.0
3 GO:1902455 0.969545 0 2 Negative regulation of stem cell population maintenance (1) 0.0
4 GO:0001556 0.965979 0 6 Oocyte maturation (1) 0.0
5 GO:0045636 0.955115 0 6 Positive regulation of melanocyte differentiation (1) 0.0
6 GO:0010750 0.955000 0 4 Positive regulation of nitric oxide mediated signal transduction (1) 0.0
7 GO:0060020 0.949434 0 1 Bergmann glial cell differentiation (1) 0.0
8 GO:1902042 0.945804 0 4 Negative regulation of extrinsic apoptotic signaling pathway via death domain receptors (1) 0.0
9 GO:1902236 0.941667 0 12 Negative regulation of endoplasmic reticulum stress-induced intrinsic apoptotic signaling pathway (1) 0.0
10 GO:0070059 0.936432 1 2 Intrinsic apoptotic signaling pathway in response to endoplasmic reticulum stress (1) 1.0
11 GO:0051453 0.935521 1 2 Regulation of intracellular ph (1) 1.0
12 GO:0042659 0.931364 0 3 Regulation of cell fate specification (1) 0.0
13 GO:0006360 0.930046 2 7 Transcription by rna polymerase i (1) 2.0
14 GO:0006959 0.921730 2 2 Humoral immune response (1) 2.0

WORST 15 PREDICTED GO TERMS

# Ids of the 15 GO terms with the lowest cross-validated AUC.
worst15goterms = np.array(
    GO_terms_auc_svm_df.sort_values(by=["auc"], ascending=True).index[:15]
)

Get Worst GO term names

# Attach the GO term descriptions to the 15 worst models.
# real_go_info is matched through the "_1" node id of each term.
worst15goterms_1 = [goterm+"_"+str(1) for goterm in worst15goterms]
# .copy() gives an independent frame, so the GO_term edit below does not act
# on a slice of real_go_info (no SettingWithCopyWarning).
real_go_info_mod_worst = real_go_info[real_go_info.GO_term.isin(worst15goterms_1)].copy()
# Strip only the trailing "_1" node suffix (anchored, so nothing else in the
# identifier can be altered).
real_go_info_mod_worst["GO_term"] = real_go_info_mod_worst["GO_term"].str.replace(r"_1$", "", regex=True)
worst15goterms_auc = GO_terms_auc_svm_df.sort_values(by=["auc"], ascending=True)[0:15].reset_index()
worst15goterms_auc.columns=["GO_term","auc","levels","parents"]
worst15goterms_auc.merge(real_go_info_mod_worst[real_go_info_mod_worst["GO_term"].isin(worst15goterms)], on="GO_term")
GO_term auc levels parents Name layer_number
0 GO:0000077 0.284021 1 3 Dna damage checkpoint signaling (1) 1.0
1 GO:0006869 0.299648 3 2 Lipid transport (1) 3.0
2 GO:0051302 0.314545 1 2 Regulation of cell division (1) 1.0
3 GO:0016485 0.318636 3 5 Protein processing (1) 3.0
4 GO:0019722 0.322272 2 1 Calcium-mediated signaling (1) 2.0
5 GO:0046854 0.326276 1 2 Phosphatidylinositol phosphate biosynthetic process (1) 1.0
6 GO:0060740 0.331825 1 6 Prostate gland epithelium morphogenesis (1) 1.0
7 GO:0060444 0.347273 1 8 Branching involved in mammary gland duct morphogenesis (1) 1.0
8 GO:0006919 0.352725 1 3 Activation of cysteine-type endopeptidase activity involved in apoptotic process (1) 1.0
9 GO:0032436 0.353421 1 14 Positive regulation of proteasomal ubiquitin-dependent protein catabolic process (1) 1.0
10 GO:0055119 0.353947 1 1 Relaxation of cardiac muscle (1) 1.0
11 GO:0001892 0.355979 1 5 Embryonic placenta development (1) 1.0
12 GO:0031295 0.364518 0 8 T cell costimulation (1) 0.0
13 GO:0046620 0.365476 1 3 Regulation of organ growth (1) 1.0
14 GO:0008361 0.367423 2 1 Regulation of cell size (1) 2.0

AUPR histogram

# Gather the cross-validated AUPR of every GO-term model in a one-column frame
# indexed by GO term, discarding undefined values, and show the best ones.
GO_terms_aupr_svm_df = (
    pd.DataFrame(list(GO_terms_aupr_svm.items()), columns=['goterm', 'aupr'])
    .set_index("goterm")
    .dropna()
)
GO_terms_aupr_svm_df.sort_values(by=["aupr"], ascending=False).head()
aupr
goterm
GO:0036289 0.996209
GO:0006807 0.945077
GO:0050896 0.921869
GO:0043170 0.909722
GO:0009058 0.900903
# Add, for every modelled GO term, its hierarchy level (level_number - 1) and
# its number of incoming edges in dG (stored as "parents").
number_parents = pd.DataFrame.from_dict(
    {term: len(list(dG.in_edges(term))) for term in GO_terms_aupr_svm_df.index},
    orient='index',
)
levels = pd.DataFrame.from_dict(
    {term: level_number[term]-1 for term in GO_terms_aupr_svm_df.index},
    orient='index',
)

# Concatenate onto the "aupr" column only, so that re-running this cell
# rebuilds the two extra columns instead of stacking duplicates (which made
# the column renaming below fail with a length mismatch).
GO_terms_aupr_svm_df = pd.concat([GO_terms_aupr_svm_df[["aupr"]], levels,number_parents], axis=1)
GO_terms_aupr_svm_df.columns = ["aupr","levels","parents"]
# Box plot of the cross-validated AUPR grouped by level of the GO hierarchy.
c = ['#E8384F', '#FD817D', '#FDAE33',
         '#EECC16', '#A4C61A', '#37A862',"#208EA3","#3B6EAB"]

# NOTE(review): `df` (plotly's sample "tips" dataset) is not used by this
# plot; kept in case later cells rely on it - confirm and remove.
df = px.data.tips()
fig = px.box(GO_terms_aupr_svm_df, x="levels", y="aupr",
             color="levels",
            color_discrete_sequence=c,
             width =600,
             height=400,
              template="simple_white",
              labels=dict(levels="Level of GO hierarchy", aupr="AUPR")
            )
fig.update_traces(width=0.9)

fig.add_shape( # add a horizontal "target" line at AUPR = 0.7
    type="line", line_color="salmon", line_width=3, opacity=1, line_dash="dot",
    x0=0, x1=1, xref="paper", y0=0.7, y1=0.7, yref="y"
)


fig.update_layout(
   # The bold tag is now closed properly (it was "<b> ... <b>").
   title=dict(text="<b> AUPR value grouped by level of GO hierarchy </b>",
             x=0.5,
             y=0.9,
              font=dict(size=18),
              xanchor='center',
              yanchor='top'),
    # The x tick labels are hidden; the levels are identified by the legend.
    xaxis=dict(ticks="", showticklabels=False, showgrid=False, zeroline=False),
    yaxis=dict(ticks="", showticklabels=True, showgrid=True, zeroline=False),
 #   yaxis_range=[min(yy.flatten()),max(yy.flatten())],
  #  xaxis_range=[min(xx.flatten()),max(xx.flatten())],
    legend=dict(x=1.1, y=1, orientation="v",font=dict(size=11)),
    paper_bgcolor='rgba(0,0,0,0)',
    font=dict(family='Roboto',color= "#36382E",size=15)
    )

fig.show()
# Export the figure as a high-resolution PNG.
pio.write_image(fig, resultsdir+"AUPR_levels.png", width=600, height=400,scale=8)

Example prediction

def f2(goterm):    
    """Return the selected GO term unchanged (callback for the combobox widget)."""
    return goterm
# Combobox listing the modelled GO terms, ordered by decreasing AUC; the value
# returned by f2 is exposed through `combobox_go.result`.
combobox_go = interactive(f2, goterm=widgets.Combobox(options=list(GO_terms_auc_svm_df.sort_values(by=["auc"], ascending=False).index)))

Choose GO term to study…

# Show the widget and read the GO term currently selected in it.
display(combobox_go)
selected_go = combobox_go.result
# ROC curve of the selected GO term using the Platt probabilities
plt.rcParams['figure.figsize'] = (4, 2)
fpr, tpr, _ = metrics.roc_curve(slim_matrix_single_neuron.loc[selected_go],  platt_matrix.loc[selected_go])
auc = metrics.roc_auc_score(slim_matrix_single_neuron.loc[selected_go],  platt_matrix.loc[selected_go])
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

# Distribution of the probabilities for annotated vs non-annotated drugs
plot = pd.concat([pd.DataFrame(slim_matrix_single_neuron.loc[selected_go]),pd.DataFrame(platt_matrix.loc[selected_go])], axis=1)
plot.columns = ["slim","probability"]
ax = sns.boxplot(x="slim", y="probability", data=plot,showfliers=False )

# ROC curve of the selected GO term using the delta logits
# (`auc` is overwritten here, so the value printed below is this one)
fpr, tpr, _ = metrics.roc_curve(slim_matrix_single_neuron.loc[selected_go],  delta_logits_matrix.loc[selected_go])
auc = metrics.roc_auc_score(slim_matrix_single_neuron.loc[selected_go],  delta_logits_matrix.loc[selected_go])
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

# Same box plot for the delta logits (the column is still named "probability")
plot = pd.concat([pd.DataFrame(slim_matrix_single_neuron.loc[selected_go]),pd.DataFrame(delta_logits_matrix.loc[selected_go])], axis=1)
plot.columns = ["slim","probability"]
ax = sns.boxplot(x="slim", y="probability", data=plot,showfliers=False )

# Confusion matrix of the hard SVM predictions
plt.rcParams['figure.figsize'] = (2, 2)
metrics.ConfusionMatrixDisplay.from_predictions(slim_matrix_single_neuron.loc[selected_go], preds_svm_matrix.loc[selected_go])
plt.grid(visible=None)

# Summary metrics of the hard predictions for the selected GO term
print("Accuracy:",metrics.accuracy_score(slim_matrix_single_neuron.loc[selected_go], preds_svm_matrix.loc[selected_go]))
print("Precision:",metrics.precision_score(slim_matrix_single_neuron.loc[selected_go], preds_svm_matrix.loc[selected_go]))
print("Recall:",metrics.recall_score(slim_matrix_single_neuron.loc[selected_go], preds_svm_matrix.loc[selected_go])) # TP / (TP+FN)
print("AUC with score:",auc) # ROC AUC computed above from the delta logits
Accuracy: 0.9782608695652174
Precision: 0.7894736842105263
Recall: 0.9375
AUC with score: 0.9947429906542057

TN - FP

FN - TP

# Precision-recall curve of the selected GO term.
plt.rcParams['figure.figsize'] = (4, 2)
# Score the curve with the Platt probabilities, as done for the per-term AUPR
# in the cross-validation loop. The hard 0/1 predictions used before collapse
# the curve to a single operating point.
precision, recall, thresholds = metrics.precision_recall_curve(slim_matrix_single_neuron.loc[selected_go],  platt_matrix.loc[selected_go])
auc_precision_recall = metrics.auc(recall, precision)
plt.plot(recall, precision,label=str(auc_precision_recall))
plt.legend(loc=4)
plt.show()

METRICS drugs

# Per-drug metrics: every drug is scored across all modelled GO terms.
auc_drugs = {}
aupr_drugs = {}
precision_drugs = {}
for drug in slim_matrix_single_neuron.columns:
    drug_truth = slim_matrix_single_neuron.loc[:, drug]
    # Drugs without any annotation among the modelled GO terms are skipped.
    if drug_truth.sum() == 0:
        continue
    drug_platt = platt_matrix.loc[:, drug]
    auc_drugs[drug] = metrics.roc_auc_score(drug_truth, drug_platt)
    precision, recall, thresholds = metrics.precision_recall_curve(drug_truth, drug_platt)
    aupr_drugs[drug] = metrics.auc(recall, precision)
    precision_drugs[drug] = metrics.precision_score(drug_truth, preds_svm_matrix.loc[:, drug])

# One-column frames indexed by drug (the index keeps the name 'goterm').
auc_drugs_df = (
    pd.DataFrame(list(auc_drugs.items()), columns=['goterm', 'auc'])
    .set_index("goterm")
    .dropna()
)

aupr_drugs_df = (
    pd.DataFrame(list(aupr_drugs.items()), columns=['goterm', 'aupr'])
    .set_index("goterm")
    .dropna()
)

precision_drugs_df = (
    pd.DataFrame(list(precision_drugs.items()), columns=['goterm', 'precision'])
    .set_index("goterm")
    .dropna()
)

AUC histogram drugs

# Histogram of the per-drug AUC; bins whose left edge lies above 0.7 are
# highlighted and the share of drugs with AUC > 0.7 is printed on the figure.
sns.set(rc={'figure.figsize':(10,6)})
fig, ax = plt.subplots()
perc = str(round((100*len(auc_drugs_df[auc_drugs_df["auc"]>0.7])/len(auc_drugs_df)),2))+"%"
N, bins, patches = plt.hist(auc_drugs_df, color=CB_color_cycle[6],bins=50, linewidth=0.1)

# Highlight the bins above the threshold
for i in range(0,len(bins)-1):
    if bins[i]>0.7:
        patches[i].set_facecolor(CB_color_cycle[5])

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

# Hide the top, right and left spines; draw the bottom one in light gray.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

# Hide the bottom ticks, keep the left ones.
ax.tick_params(bottom=False, left=True)

# Add a light gray horizontal grid drawn below the bars (no vertical grid).
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)

plt.xlabel("AUC value", fontsize=20)
plt.ylabel("Number of drugs", fontsize=20)
# Hand-made legend entry for the highlighted bins
colors2 = {'Drugs with AUC>=0.7':CB_color_cycle[5]}  
labels = list(colors2.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors2[label]) for label in labels]
plt.legend(handles, labels,fontsize=20, loc="lower left", bbox_to_anchor=(0.35,-0.35))
plt.text(0.79, 6, str(perc), fontsize=20,color='#333333')
# NOTE(review): this title says "mutations" while the GO-term histogram title
# says "expression" - confirm which input data this run uses.
plt.title("Overall performance by drugs using mutations", fontsize=24)
# Author note: works best with the plain sum of the attributions.
fig.tight_layout()
fig.savefig(resultsdir+'drugsAUC.png', transparent=True)

AUC waterfall plot drugs

# Waterfall plot: one bar per drug, sorted by decreasing AUC.
auc_drugs_df =auc_drugs_df.sort_values(by=["auc"], ascending=False)
plt.rcParams['figure.figsize'] = (12, 9)
# `drugs` holds the drug names (the frame's index) and `rhos` the AUC values.
drugs = auc_drugs_df.index
rhos = auc_drugs_df["auc"]

# Percentage of drugs with AUC > 0.69 (printed on the figure below).
# NOTE(review): the bar colouring below uses >= 0.69, so a value of exactly
# 0.69 is coloured but not counted here.
percentage = round((sum(rhos>0.69)/len(rhos))*100,1)

fig, ax = plt.subplots()
#colors = ['#208EA3' if (x < 0.5) else '#A4C61A' for x in rhos ]
# Gray edge for bars below the threshold, purple otherwise.
colors = ['#C9C9C9' if (x < 0.69) else "#B678BE" for x in rhos ]
ax.bar(
    x=drugs,
    height=rhos,
    edgecolor=colors,
    linewidth=3
)
plt.xticks([])
plt.yticks(fontsize=28)


# Hide all four spines (figure borders), which are not needed for this
# bar chart.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
#ax.spines['bottom'].set_color('#DDDDDD')

# Remove the ticks as well.
ax.tick_params(bottom=False, left=False)

# No grid lines on either axis.
ax.set_axisbelow(False)
ax.yaxis.grid(False)
#ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)


# Add axis labels. `labelpad` controls the space between each label and
# its axis.
ax.set_xlabel('Drugs', labelpad=-30, color='#333333',fontsize=50)
ax.set_ylabel('AUC-ROC value', labelpad=15, color='#333333',fontsize=50)
ax.set_title('', color='#333333',
             weight='bold')

# NOTE(review): colors2/labels/handles are only consumed by the commented-out
# legend call below; the label text does not describe this plot and looks like
# a leftover - confirm before removing.
colors2 = {'High confidence drugs (r>0.5)':'#A4C61A'}  
labels = list(colors2.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors2[label]) for label in labels]
#plt.legend(handles, labels,fontsize=40, loc="lower left",bbox_to_anchor=(0, -0.215))
plt.text(77, 0.32, str(percentage)+"%", fontsize=60,color='#000000')

plt.ylim((-0.1,1.1))
# Make the chart fill out the figure better.
fig.tight_layout()
fig.savefig(resultsdir+'WaterfallModelsSVM_drugs.png', transparent=True)

AUPR histogram drugs

# Histogram of the per-drug AUPR; the title shows the share of drugs whose
# AUPR exceeds 0.69 and the bins above that value are highlighted.
sns.set(rc={'figure.figsize':(5,3)})
perc = f"{round(100*len(aupr_drugs_df[aupr_drugs_df['aupr']>0.69])/len(aupr_drugs_df), 2)}%"
N, bins, patches = plt.hist(aupr_drugs_df, color=CB_color_cycle[6], bins=50, linewidth=0.1)
# Pair every bar with the left edge of its bin.
for left_edge, bar_patch in zip(bins[:-1], patches):
    if left_edge > 0.69:
        bar_patch.set_facecolor(CB_color_cycle[3])

plt.xlabel("AUPR drugs", fontsize=16)
plt.title(perc, fontsize=16)
Text(0.5, 1.0, '33.62%')

Example drug prediction

def f(drug):    
    """Return the selected drug unchanged (callback for the combobox widget)."""
    
    return drug
# Node ids ("<GO id>_1") of every GO term that has a model.
predictions_nodes = [goterm+"_"+str(1) for goterm in platt_matrix.index]
# Add names to GO terms.
# .copy() gives an independent frame, so the GO_term edit below does not act
# on a slice of real_go_info (no SettingWithCopyWarning).
real_go_info_svm = real_go_info[real_go_info.GO_term.isin(predictions_nodes)].copy()
# Strip only the trailing "_1" node suffix (anchored, so nothing else in the
# identifier can be altered).
real_go_info_svm["GO_term"] = real_go_info_svm["GO_term"].str.replace(r"_1$", "", regex=True)
# Combobox listing the drugs, ordered by decreasing precision.
combobox = interactive(f, drug=widgets.Combobox(options=list(precision_drugs_df.sort_values(by=["precision"], ascending=False).index)))

Choose drug to study…

# Show the widget and read the drug currently selected in it.
display(combobox)
selected_drug_name = combobox.result
sns.set(rc={'figure.figsize':(4,2)})
# ROC curve of the selected drug (across GO terms) using the Platt probabilities
fpr, tpr, _ = metrics.roc_curve(slim_matrix_single_neuron.loc[:,selected_drug_name], platt_matrix.loc[:,selected_drug_name] )
auc = metrics.roc_auc_score(slim_matrix_single_neuron.loc[:,selected_drug_name],  platt_matrix.loc[:,selected_drug_name])
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

# Distribution of the SVM probabilities for annotated vs non-annotated GO terms
plot = pd.concat([pd.DataFrame(slim_matrix_single_neuron.loc[:,selected_drug_name]),pd.DataFrame(platt_matrix.loc[:,selected_drug_name])], axis=1)
plot.columns = ["slim","svm score"]
ax = sns.boxplot(x="slim", y="svm score", data=plot,showfliers=False )

# Same comparison on the raw attributions (multiplied by 1e4), using the full
# node-level slim matrix
plot = pd.concat([pd.DataFrame(slim_matrix.loc[:,selected_drug_name]),pd.DataFrame(attribution_data_annotated.loc[:,selected_drug_name]*1e4)], axis=1)
plot.columns = ["slim","attribution"]
ax = sns.boxplot(x="slim", y="attribution", data=plot,showfliers=True )

# Confusion matrix of the hard SVM predictions for the selected drug
metrics.ConfusionMatrixDisplay.from_predictions(slim_matrix_single_neuron.loc[:,selected_drug_name].round(), preds_svm_matrix.loc[:,selected_drug_name])
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2502863c8b0>

# Summary metrics of the hard predictions for the selected drug, followed by
# the ROC AUC computed earlier from the Platt probabilities.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),  # TP / (TP+FN)
):
    print(metric_label, metric_fn(slim_matrix_single_neuron.loc[:, selected_drug_name], preds_svm_matrix.loc[:, selected_drug_name]))
print("AUC with score:", auc)
Accuracy: 0.663471778487753
Precision: 0.8651026392961877
Recall: 0.5221238938053098
AUC with score: 0.9947429906542057
# These are the TEST delta logits: delta_logits_matrix holds the values
# predicted on the held-out cross-validation folds (the variable is named
# `train_drug_logs` nonetheless).
train_drug_logs = pd.DataFrame(delta_logits_matrix.loc[:,selected_drug_name]).reset_index()
# The column keeps the name "probability" although it stores delta logits.
train_drug_logs.columns  = ["GO_term","probability"]
# Attach the GO term descriptions and list the terms by decreasing score.
train_drug_logs = train_drug_logs.merge(real_go_info_svm, on="GO_term")
train_drug_logs.sort_values(by=["probability"], ascending=False)

Final model SVM

Once the models have been cross-validated we create the final models using all samples…

# Results and fitted estimators of the final (all-sample) models
GO_terms_auc_svm_final = {}
GO_terms_aupr_svm_final = {}
GO_terms_precision_svm_final = {}
models_svm = {}

# Fit one final SVM per GO term on all annotated drugs
for goterm in sparseGO_terms:
    # Annotation vector of the term, read from its "_1" row
    goterm_drugs = slim_matrix.loc[[goterm+"_"+str(1)]].values.flatten()

    # Same filter as in the cross-validation: skip terms with 8 or fewer drugs
    if sum(goterm_drugs) <= 8:
        continue

    # Features: the six rows of the term, scaled by their standard deviation
    list_nodes = [goterm+"_"+str(i) for i in range(1,7)]
    score = attribution_data_annotated.loc[list_nodes].T
    score_mod = score.divide(score.std()).fillna(0)

    # The final model is trained and evaluated on the same samples, so the
    # metrics below are training-set values.
    X_train = X_test = score_mod
    y_train = y_test = goterm_drugs

    gamma="scale"
    C=1

    svm_model = svm.SVC(
        kernel='rbf',
        C=C,
        gamma=gamma,
        class_weight="balanced",
        tol=0.001,
        probability=True,
        random_state=1234,
    )
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    y_pred_proba = svm_model.predict_proba(X_test)[:, 1]  # platt values

    GO_terms_auc_svm_final[goterm] = metrics.roc_auc_score(y_test, y_pred_proba)

    precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_proba)
    GO_terms_aupr_svm_final[goterm] = metrics.auc(recall, precision)
    GO_terms_precision_svm_final[goterm] = metrics.precision_score(y_test, y_pred)

    # Keep the fitted estimator for later use
    models_svm[goterm] = svm_model
len(models_svm)
939

Final model AUC

# Training-set AUC of every final model, as a one-column frame indexed by GO
# term (undefined values dropped), displayed from best to worst.
GO_terms_auc_svm_df_final = (
    pd.DataFrame(list(GO_terms_auc_svm_final.items()), columns=['goterm', 'auc'])
    .set_index("goterm")
    .dropna()
)
GO_terms_auc_svm_df_final.sort_values(by=["auc"], ascending=False)
auc
goterm
GO:0036289 1.000000
GO:0060440 0.998540
GO:0043162 0.995455
GO:0070059 0.994760
GO:0071364 0.994109
GO:1901029 0.994048
GO:0072384 0.993636
GO:0051453 0.993393
GO:0001556 0.991972
GO:0090201 0.991808
GO:0010750 0.990909
GO:0016573 0.990783
GO:1903800 0.990573
GO:1904950 0.989945
GO:1902455 0.989091
GO:0042149 0.987697
GO:0034983 0.987273
GO:1990403 0.985909
GO:0071353 0.985587
GO:0006275 0.984226
GO:0010971 0.984091
GO:0006869 0.983409
GO:0001779 0.983182
GO:0051973 0.981651
GO:0060749 0.980895
GO:0042771 0.980633
GO:0072655 0.980455
GO:0061734 0.980455
GO:0045636 0.980178
GO:0045737 0.980084
GO:1902236 0.979762
GO:0060632 0.979545
GO:0016575 0.978731
GO:0042659 0.977727
GO:0046628 0.977376
GO:1902042 0.977273
GO:0098780 0.975909
GO:0046902 0.975849
GO:0051607 0.975552
GO:0006401 0.974678
GO:0017157 0.974040
GO:0032740 0.973856
GO:0006270 0.973848
GO:0046666 0.973570
GO:0008045 0.972603
GO:0006303 0.972553
GO:0042177 0.972431
GO:0060020 0.972290
GO:0006360 0.972095
GO:2001021 0.971520
GO:0042733 0.971364
GO:0016572 0.971342
GO:0070932 0.970909
GO:2001257 0.970909
GO:0001782 0.970384
GO:0006261 0.970112
GO:1905564 0.969834
GO:2000757 0.969545
GO:0051354 0.969091
GO:0072284 0.969069
GO:0051926 0.968891
GO:0043407 0.968585
GO:0034394 0.968096
GO:0050870 0.967621
GO:0046898 0.967143
GO:0031047 0.967115
GO:0016925 0.966364
GO:0035790 0.966361
GO:0006417 0.965261
GO:0032469 0.965008
GO:0035195 0.964816
GO:0021782 0.964091
GO:0070584 0.963810
GO:0051384 0.961083
GO:0002326 0.960811
GO:2000773 0.960310
GO:0050729 0.959779
GO:0046942 0.959480
GO:0035249 0.959091
GO:0045821 0.958904
GO:0099111 0.958880
GO:0071670 0.958851
GO:0006367 0.958333
GO:1905278 0.958270
GO:0010559 0.957929
GO:0006959 0.957854
GO:0018205 0.957782
GO:0035860 0.957768
GO:0031640 0.957381
GO:0007059 0.957268
GO:0070373 0.956762
GO:0030282 0.956762
GO:0001658 0.956522
GO:0030890 0.956075
GO:0035754 0.955757
GO:0010832 0.955455
GO:0099173 0.955238
GO:0021695 0.955238
GO:0045727 0.955026
GO:0002862 0.954696
GO:0014827 0.954432
GO:0016579 0.953923
GO:0002718 0.953854
GO:0071320 0.953746
GO:0051281 0.953182
GO:0042552 0.953182
GO:0000086 0.953095
GO:0032147 0.952991
GO:0032436 0.952499
GO:0010592 0.952273
GO:0006694 0.951735
GO:0033141 0.951735
GO:0071480 0.951429
GO:0006612 0.951118
GO:0048011 0.950729
GO:1903077 0.950714
GO:0033619 0.950455
GO:0006352 0.950306
GO:0001662 0.950221
GO:0010039 0.950040
GO:0090314 0.949147
GO:0034502 0.949074
GO:0014823 0.948954
GO:2001240 0.948220
GO:0007617 0.948182
GO:0032743 0.947281
GO:0006310 0.947141
GO:0006605 0.946678
GO:0006975 0.946204
GO:2000739 0.946101
GO:1902459 0.945909
GO:0007626 0.945701
GO:0023019 0.945116
GO:0003376 0.944700
GO:0006576 0.944346
GO:0038007 0.943690
GO:0050728 0.943637
GO:0032922 0.942661
GO:0045740 0.942465
GO:1900118 0.942381
GO:0010952 0.942143
GO:1905710 0.942143
GO:1902166 0.942128
GO:0008637 0.941950
GO:2000010 0.941865
GO:0055118 0.941679
GO:0000423 0.941364
GO:0043154 0.941156
GO:0048701 0.940775
GO:0008210 0.940749
GO:1900272 0.940171
GO:0060997 0.939809
GO:0007263 0.939545
GO:2000379 0.939167
GO:1900020 0.939091
GO:0050896 0.938915
GO:0016485 0.938636
GO:0043966 0.938376
GO:0002437 0.938295
GO:2000300 0.937318
GO:0140013 0.937095
GO:0034767 0.936758
GO:0031648 0.936624
GO:0007026 0.936364
GO:0032024 0.936149
GO:0030193 0.936040
GO:0010212 0.935098
GO:0006457 0.934641
GO:0032729 0.934420
GO:0030593 0.934413
GO:0010575 0.934272
GO:0008064 0.933643
GO:0008286 0.932331
GO:0001818 0.932128
GO:0030513 0.931404
GO:0060766 0.931364
GO:0006396 0.931346
GO:0006919 0.931342
GO:0038096 0.930886
GO:0001553 0.930810
GO:0045580 0.930407
GO:0046326 0.930406
GO:0035025 0.930294
GO:1903146 0.929091
GO:0060444 0.929091
GO:0006412 0.928571
GO:0048536 0.928290
GO:0002819 0.927685
GO:0048704 0.927370
GO:0051054 0.927333
GO:0090184 0.927099
GO:1900006 0.926941
GO:2000134 0.926917
GO:0046889 0.926822
GO:0043123 0.926512
GO:0070842 0.926364
GO:0046329 0.926364
GO:0006898 0.925891
GO:0006368 0.925841
GO:1905897 0.925743
GO:0030048 0.925591
GO:0042180 0.925076
GO:0035909 0.924883
GO:0051209 0.924065
GO:0030308 0.923951
GO:0043170 0.923707
GO:0035726 0.922783
GO:0031663 0.922727
GO:0000209 0.922119
GO:0009165 0.921544
GO:0002720 0.921427
GO:0006096 0.921292
GO:1902036 0.921254
GO:0071549 0.921066
GO:0007528 0.920950
GO:0090090 0.920930
GO:0042472 0.920455
GO:0031056 0.920429
GO:0050864 0.920262
GO:0060789 0.920000
GO:0007389 0.919762
GO:0048743 0.919572
GO:0030705 0.919116
GO:0060179 0.919091
GO:0045739 0.918823
GO:0043627 0.917977
GO:0040018 0.917659
GO:2001243 0.917078
GO:0090037 0.917056
GO:0040016 0.915987
GO:0043552 0.915951
GO:0001666 0.915013
GO:0010508 0.914755
GO:0033690 0.914545
GO:0098586 0.914419
GO:0043922 0.914091
GO:0035994 0.914021
GO:0031398 0.913694
GO:0042093 0.913524
GO:0032410 0.913182
GO:1901224 0.913182
GO:0006839 0.913167
GO:0045907 0.912844
GO:2000278 0.912619
GO:2001236 0.912563
GO:0048170 0.912474
GO:0071839 0.912217
GO:0031507 0.911552
GO:0060391 0.911011
GO:0032148 0.910451
GO:0070102 0.910000
GO:0030878 0.909762
GO:0035162 0.909463
GO:0051225 0.909314
GO:0002931 0.909064
GO:0007411 0.908683
GO:0008625 0.908500
GO:0035788 0.908313
GO:0010921 0.907360
GO:0048266 0.906977
GO:0010977 0.906667
GO:0050910 0.906656
GO:0045732 0.906062
GO:0046620 0.905714
GO:0035855 0.905551
GO:0030316 0.905551
GO:0006469 0.905340
GO:0090263 0.905136
GO:0021953 0.904874
GO:0060312 0.904790
GO:0006260 0.904703
GO:0030521 0.904434
GO:0008016 0.904091
GO:0010727 0.904091
GO:0030509 0.904035
GO:0007498 0.903914
GO:0050769 0.903592
GO:0050792 0.903414
GO:0009582 0.903167
GO:0007098 0.902745
GO:0002821 0.902464
GO:0071276 0.902162
GO:0007286 0.901132
GO:0045088 0.900952
GO:0055003 0.900943
GO:0035767 0.900748
GO:0045987 0.900474
GO:0061029 0.900474
GO:0033327 0.900465
GO:0000422 0.900374
GO:0010976 0.900117
GO:0008354 0.899895
GO:0070528 0.899726
GO:0006807 0.899601
GO:0045833 0.899128
GO:1905065 0.898923
GO:0007018 0.898915
GO:0007422 0.898647
GO:0048484 0.898636
GO:0032467 0.898182
GO:0050795 0.897909
GO:0030539 0.897909
GO:0048538 0.897833
GO:0032355 0.897646
GO:0007416 0.897554
GO:0021575 0.897509
GO:0060348 0.897410
GO:0001569 0.897282
GO:0060384 0.897171
GO:0031069 0.897099
GO:0050918 0.897059
GO:0035584 0.896905
GO:0051046 0.896369
GO:0043129 0.896233
GO:0001843 0.896024
GO:0046330 0.895444
GO:0007030 0.895429
GO:0048873 0.895092
GO:0000724 0.894922
GO:0007202 0.894511
GO:1903053 0.894419
GO:0003338 0.894238
GO:1901990 0.894150
GO:0060644 0.893917
GO:0043161 0.893782
GO:0030838 0.892727
GO:0001946 0.892571
GO:0072210 0.892039
GO:0030101 0.892003
GO:0050731 0.892003
GO:0010613 0.891865
GO:0030325 0.891865
GO:0048714 0.891783
GO:0048008 0.891667
GO:0001823 0.890989
GO:0016239 0.890496
GO:0030216 0.890460
GO:0071300 0.890341
GO:0032008 0.889952
GO:0061045 0.889881
GO:0051894 0.889619
GO:0030010 0.889612
GO:0031016 0.889533
GO:0001942 0.889526
GO:1902533 0.889155
GO:0016358 0.888660
GO:0001501 0.888280
GO:0051092 0.888251
GO:0016601 0.887883
GO:0097067 0.887324
GO:0009306 0.887019
GO:0048167 0.886555
GO:0050921 0.886315
GO:1990384 0.886268
GO:0046883 0.886202
GO:0007519 0.886154
GO:0043270 0.885881
GO:0003007 0.885720
GO:0071900 0.885420
GO:0007585 0.885391
GO:2001214 0.885258
GO:0071456 0.884685
GO:0016567 0.884594
GO:0060740 0.882856
GO:0035094 0.882732
GO:0072073 0.882732
GO:0060612 0.881602
GO:0060325 0.881498
GO:0045668 0.881347
GO:0042531 0.881332
GO:0010038 0.881167
GO:0071333 0.880972
GO:0006939 0.880907
GO:0090141 0.880907
GO:0046718 0.880697
GO:0051770 0.880461
GO:0033627 0.880455
GO:0048149 0.880352
GO:0002685 0.880291
GO:0043029 0.880195
GO:0038033 0.879699
GO:0055119 0.879336
GO:0003300 0.878843
GO:0005984 0.878788
GO:0002218 0.878773
GO:0072239 0.878669
GO:0031103 0.878667
GO:0048557 0.878638
GO:1901987 0.878627
GO:0060048 0.877703
GO:0045637 0.877659
GO:2001234 0.877406
GO:0038083 0.876762
GO:0071277 0.876323
GO:0048839 0.876278
GO:0000723 0.875714
GO:0060627 0.875648
GO:0035022 0.874811
GO:0007435 0.874669
GO:2001241 0.874309
GO:0002062 0.874091
GO:0035234 0.873792
GO:0034976 0.873754
GO:0007584 0.872411
GO:0002318 0.872408
GO:0001975 0.872354
GO:0071230 0.871837
GO:0034446 0.871788
GO:0070933 0.871364
GO:0030072 0.871331
GO:0071897 0.871171
GO:0035733 0.870478
GO:0032967 0.870403
GO:0048675 0.870071
GO:0060571 0.870035
GO:0050920 0.869917
GO:0050678 0.869106
GO:0034405 0.869048
GO:0051150 0.868932
GO:0001934 0.868720
GO:0010507 0.868700
GO:1904707 0.868636
GO:0050821 0.868325
GO:0006811 0.868262
GO:0070588 0.868155
GO:0014911 0.867596
GO:0090280 0.867440
GO:0008630 0.867386
GO:1901796 0.867386
GO:0051056 0.867368
GO:0051321 0.865996
GO:0051051 0.865833
GO:0051902 0.865573
GO:0097009 0.865089
GO:0060271 0.865061
GO:0045930 0.864995
GO:0035304 0.864977
GO:0051899 0.864866
GO:0033028 0.864808
GO:0018108 0.864767
GO:1900087 0.864434
GO:0010467 0.863952
GO:0035019 0.863557
GO:0006687 0.863557
GO:0001824 0.863532
GO:0033689 0.863522
GO:0071392 0.863443
GO:0035264 0.863252
GO:0046632 0.862800
GO:0034605 0.862619
GO:0032091 0.862599
GO:0072659 0.862358
GO:0051901 0.861670
GO:0006357 0.861504
GO:0042475 0.861448
GO:0045747 0.861374
GO:0072006 0.860598
GO:0042220 0.860483
GO:0006937 0.860353
GO:0006511 0.860111
GO:0010718 0.859229
GO:0035924 0.859169
GO:0090398 0.859050
GO:0031532 0.858981
GO:1904062 0.858745
GO:2000251 0.858605
GO:0014068 0.858156
GO:0048146 0.858102
GO:0051090 0.857756
GO:0034765 0.857317
GO:0007229 0.856812
GO:0007158 0.856712
GO:1901031 0.856712
GO:0061351 0.856372
GO:1904019 0.856183
GO:0048812 0.856107
GO:0060437 0.855565
GO:0034766 0.854758
GO:0033143 0.854574
GO:0007269 0.854497
GO:0032516 0.854484
GO:0036120 0.854433
GO:0090068 0.853947
GO:0046854 0.853881
GO:0010811 0.853842
GO:0060976 0.853774
GO:0060045 0.853680
GO:0021549 0.853311
GO:0043534 0.853142
GO:0038084 0.853135
GO:0046427 0.852947
GO:0030324 0.852866
GO:0048010 0.852488
GO:0097193 0.852297
GO:0048286 0.852143
GO:0006468 0.851852
GO:0060326 0.851772
GO:0034097 0.851678
GO:0016071 0.851667
GO:0036324 0.851085
GO:1903010 0.851085
GO:0002327 0.850962
GO:0001570 0.850955
GO:0043536 0.850601
GO:0043406 0.850494
GO:0045347 0.850455
GO:0001701 0.850196
GO:0019222 0.849913
GO:0051403 0.849741
GO:0097021 0.849170
GO:0043467 0.848706
GO:0045766 0.848621
GO:0060562 0.848060
GO:0030001 0.847486
GO:0006810 0.847446
GO:0031667 0.847070
GO:0048565 0.846000
GO:0019827 0.845649
GO:0007565 0.845356
GO:0009966 0.844893
GO:0055085 0.844768
GO:0043114 0.844749
GO:0002548 0.844626
GO:2000377 0.844341
GO:0030198 0.844187
GO:0032386 0.844167
GO:0031929 0.844150
GO:0035306 0.843956
GO:0006897 0.843955
GO:0051301 0.843815
GO:0001656 0.843809
GO:0042060 0.843773
GO:0031109 0.843563
GO:0000122 0.843521
GO:0043124 0.843017
GO:0001837 0.842638
GO:1902275 0.841719
GO:0051261 0.841719
GO:0051924 0.841520
GO:0002250 0.841465
GO:0030336 0.841059
GO:0046631 0.840909
GO:0016055 0.840841
GO:0033077 0.840735
GO:0048741 0.840370
GO:0007266 0.839667
GO:0001938 0.838948
GO:0043586 0.838898
GO:0008277 0.837920
GO:0043303 0.837858
GO:0070662 0.837526
GO:0060374 0.836916
GO:0045087 0.836889
GO:0034220 0.836107
GO:0032388 0.835532
GO:0048568 0.835305
GO:0050866 0.835227
GO:0009058 0.834946
GO:1902074 0.834912
GO:0043244 0.834906
GO:0008542 0.834749
GO:0045055 0.834433
GO:0045444 0.834286
GO:0046578 0.834019
GO:0046777 0.833773
GO:0001889 0.833595
GO:0008584 0.833556
GO:0045840 0.833536
GO:0002366 0.833530
GO:0007049 0.833424
GO:0046474 0.833392
GO:0019233 0.833182
GO:0000165 0.832917
GO:0051258 0.832656
GO:0032956 0.832450
GO:0022612 0.832326
GO:0051050 0.832281
GO:0043392 0.831905
GO:0031274 0.831814
GO:0051702 0.831506
GO:0010564 0.831039
GO:0031099 0.830615
GO:1905563 0.830607
GO:0030318 0.830136
GO:0048598 0.829861
GO:0007165 0.829719
GO:1901988 0.829474
GO:0007186 0.829429
GO:0033157 0.829023
GO:0019221 0.829000
GO:0000278 0.828800
GO:0042310 0.828784
GO:1901300 0.828616
GO:0006909 0.828497
GO:0030154 0.828332
GO:0002573 0.827001
GO:0045429 0.826889
GO:0051223 0.826823
GO:0016570 0.826822
GO:0030163 0.826442
GO:0009791 0.826355
GO:0090630 0.826069
GO:0032409 0.825426
GO:0048477 0.824868
GO:0034644 0.824849
GO:0007346 0.824841
GO:0046651 0.824539
GO:0051171 0.823977
GO:0000302 0.823816
GO:0048608 0.823637
GO:0032940 0.823481
GO:0008610 0.823469
GO:0010628 0.823151
GO:1903078 0.822244
GO:0016032 0.821730
GO:0009888 0.821458
GO:0016042 0.821320
GO:0007259 0.820971
GO:0008544 0.820813
GO:0000077 0.820719
GO:0021766 0.820586
GO:0001817 0.819733
GO:0001932 0.819683
GO:0002053 0.819493
GO:0072593 0.819390
GO:0009887 0.819242
GO:0006753 0.818971
GO:0071383 0.818684
GO:0007015 0.818627
GO:0001819 0.818452
GO:0007275 0.818394
GO:1903829 0.818083
GO:0002244 0.818060
GO:0051898 0.817795
GO:0009410 0.817265
GO:0030335 0.817025
GO:0061024 0.816492
GO:0007173 0.816349
GO:0050900 0.816242
GO:0060395 0.815909
GO:0009755 0.815667
GO:0045860 0.815613
GO:0050872 0.815367
GO:0007612 0.814548
GO:0000082 0.814519
GO:0050852 0.814267
GO:0043408 0.813977
GO:0002009 0.813874
GO:0019752 0.813530
GO:0001822 0.813506
GO:0007179 0.813500
GO:0051049 0.813439
GO:0010033 0.813421
GO:1901135 0.813379
GO:1900180 0.813213
GO:0033554 0.813172
GO:0007204 0.813136
GO:0044770 0.812960
GO:0001755 0.812831
GO:0001541 0.812614
GO:0006470 0.811795
GO:0009743 0.811594
GO:0033993 0.811585
GO:0035265 0.811041
GO:0051496 0.811040
GO:0007162 0.810927
GO:0030218 0.809955
GO:0006139 0.809816
GO:0070374 0.808642
GO:0006298 0.808612
GO:0009056 0.808581
GO:0070507 0.808431
GO:0071363 0.808295
GO:0050680 0.808234
GO:0007169 0.807939
GO:0001894 0.807870
GO:0000902 0.806862
GO:0009617 0.806711
GO:1902904 0.806512
GO:0030097 0.806125
GO:0007399 0.805949
GO:0050853 0.805230
GO:0051726 0.804914
GO:0008360 0.804780
GO:0050863 0.804772
GO:0010629 0.804702
GO:0032880 0.804305
GO:0021795 0.804198
GO:0046488 0.804184
GO:0031032 0.804004
GO:0045595 0.803077
GO:0006936 0.802344
GO:0045793 0.802149
GO:0071222 0.801980
GO:0051897 0.801416
GO:0006606 0.800953
GO:0006886 0.800872
GO:0030307 0.800490
GO:0048738 0.800331
GO:0010821 0.800220
GO:0051247 0.800154
GO:0042752 0.800120
GO:0032835 0.800025
GO:0033138 0.799982
GO:1903578 0.799701
GO:0050673 0.798946
GO:0006997 0.798672
GO:0060341 0.798662
GO:0006281 0.798556
GO:0042391 0.798475
GO:0050808 0.797394
GO:0007267 0.797360
GO:0050865 0.797107
GO:0018105 0.797070
GO:0060560 0.796569
GO:0071478 0.796131
GO:0018107 0.796045
GO:0019216 0.795977
GO:0023061 0.795969
GO:0036473 0.795897
GO:0051147 0.795455
GO:0006996 0.794900
GO:0030217 0.794761
GO:0070527 0.794579
GO:0050804 0.793936
GO:0060021 0.793808
GO:0045321 0.793792
GO:0046034 0.792891
GO:1904646 0.792812
GO:0030182 0.792624
GO:0002764 0.790893
GO:0007596 0.790844
GO:0043542 0.790474
GO:0006355 0.790400
GO:0010638 0.790227
GO:0042110 0.789916
GO:2000811 0.789519
GO:0045785 0.789271
GO:0001952 0.789204
GO:0048709 0.787833
GO:0016192 0.787802
GO:0002320 0.787705
GO:0045944 0.787650
GO:0035051 0.787216
GO:0070663 0.786907
GO:0046486 0.786765
GO:0006914 0.786701
GO:0071407 0.786480
GO:0048468 0.786471
GO:0043065 0.786229
GO:1902532 0.786009
GO:0033044 0.785934
GO:0031333 0.785379
GO:0071417 0.785307
GO:0016241 0.785238
GO:0007268 0.785105
GO:0007010 0.785047
GO:0002443 0.783904
GO:2000270 0.783308
GO:0001764 0.782709
GO:0051174 0.781935
GO:0034329 0.781439
GO:0043549 0.781269
GO:0010595 0.781136
GO:2001020 0.780899
GO:0050776 0.780250
GO:0007159 0.780220
GO:0048041 0.780105
GO:0016236 0.779569
GO:0048638 0.778556
GO:0042551 0.778521
GO:0007517 0.778474
GO:0032869 0.777921
GO:0051649 0.777222
GO:0009725 0.777056
GO:0030855 0.776398
GO:0002040 0.776347
GO:0071310 0.775759
GO:0042063 0.775499
GO:0009266 0.775262
GO:0048469 0.774721
GO:0042307 0.774054
GO:0032879 0.772742
GO:0002376 0.772696
GO:0055082 0.772549
GO:0016070 0.772150
GO:0060840 0.771853
GO:0010632 0.771656
GO:0007219 0.771429
GO:0051341 0.770833
GO:0060416 0.770267
GO:0090050 0.770256
GO:0002274 0.770035
GO:0009968 0.768538
GO:0009416 0.768293
GO:0009653 0.767978
GO:0030183 0.767941
GO:0007507 0.766819
GO:0007283 0.766625
GO:0048589 0.766590
GO:0050790 0.766284
GO:0065003 0.765562
GO:0030032 0.765559
GO:0048103 0.765258
GO:0006954 0.764565
GO:0048878 0.764329
GO:0007420 0.764092
GO:0030168 0.762921
GO:0006629 0.761422
GO:0006644 0.760398
GO:0001525 0.760172
GO:0120035 0.759979
GO:0034103 0.759958
GO:0014070 0.759563
GO:0044255 0.758471
GO:0051098 0.758377
GO:0051641 0.757853
GO:0034599 0.756607
GO:0043473 0.756079
GO:0036092 0.755500
GO:0048863 0.755435
GO:2000352 0.754950
GO:0030162 0.754327
GO:0042325 0.754119
GO:0008202 0.754059
GO:0033628 0.753988
GO:0051146 0.753713
GO:0010243 0.753077
GO:0043524 0.752485
GO:0003014 0.752381
GO:0002684 0.752271
GO:0001763 0.751863
GO:0051145 0.751530
GO:0045596 0.750733
GO:0000226 0.750411
GO:0031175 0.749603
GO:0007155 0.749178
GO:0002064 0.748667
GO:0045597 0.748593
GO:0040008 0.748313
GO:0060485 0.746706
GO:0006508 0.746456
GO:0097191 0.746084
GO:0016477 0.745481
GO:0005975 0.745307
GO:0043066 0.745136
GO:0050890 0.744664
GO:0007265 0.744598
GO:0032092 0.743751
GO:0051017 0.743352
GO:0007005 0.742652
GO:0043434 0.742583
GO:0003158 0.742221
GO:0042113 0.742005
GO:0120162 0.741508
GO:0051881 0.740169
GO:0030522 0.740000
GO:0007160 0.739980
GO:0048511 0.737766
GO:0044281 0.736585
GO:0007568 0.736500
GO:0007610 0.735604
GO:0035556 0.734890
GO:0048017 0.734281
GO:0006325 0.734050
GO:0006915 0.733434
GO:0008284 0.731624
GO:0045165 0.731183
GO:0002682 0.730932
GO:0022414 0.730731
GO:0097190 0.730349
GO:0051494 0.729426
GO:0051128 0.728822
GO:0043254 0.728497
GO:0098609 0.727096
GO:0006338 0.726229
GO:0007423 0.725167
GO:0001649 0.721908
GO:0048661 0.721041
GO:0010941 0.719507
GO:1900407 0.716777
GO:0007166 0.714859
GO:0002683 0.708778
GO:1902903 0.708773
GO:0008285 0.705969
GO:0030900 0.704312
GO:0034504 0.704188
GO:0033365 0.703782
GO:0070997 0.703482
GO:0033043 0.697664
GO:0051249 0.687925
GO:0008104 0.675076
GO:0033002 0.627125
GO:0042593 0.364293
GO:0071887 0.359703
GO:0044262 0.349744
GO:0051640 0.348052
GO:0051000 0.299934
GO:0050778 0.290903
GO:0007156 0.278928
GO:0008361 0.265902
GO:0070301 0.262988
GO:0022407 0.258553
GO:0015031 0.257560
GO:0043525 0.248848
GO:0051353 0.242570
GO:0043086 0.239824
GO:0045471 0.229314
GO:0051497 0.208992
GO:0031529 0.208648
GO:0099504 0.206822
GO:0043547 0.194217
GO:1904659 0.191457
GO:0031334 0.184335
GO:0046677 0.183479
GO:0015980 0.180476
GO:0060291 0.175234
GO:0009259 0.166290
GO:0060173 0.162212
GO:0042632 0.145299
GO:0046890 0.144186
GO:0032760 0.142722
GO:0051302 0.135000
GO:0031295 0.134696
GO:0019318 0.123006
GO:0010951 0.120040
GO:0021987 0.119137
GO:0006163 0.118024
GO:0030041 0.107955
GO:0001892 0.106324
GO:0030512 0.105991
GO:0060079 0.105991
GO:0050770 0.098547
GO:0051928 0.097553
GO:0031397 0.094042
GO:0060041 0.082956
GO:0051047 0.076258
GO:0019722 0.041730
GO:0090042 0.035699
# Histogram of the per-GO-term AUC values. Bars whose left edge lies above
# 0.7 are recolored, and the percentage of GO terms with AUC > 0.7 is used
# as the plot title.
sns.set(rc={'figure.figsize': (6, 4)})

n_terms = len(GO_terms_auc_svm_df_final)
n_above = len(GO_terms_auc_svm_df_final[GO_terms_auc_svm_df_final["auc"] > 0.7])
perc = str(round(100 * n_above / n_terms, 2)) + "%"

N, bins, patches = plt.hist(
    GO_terms_auc_svm_df_final,
    color=CB_color_cycle[6],
    bins=50,
    linewidth=0.1,
)

# bins holds one more edge than there are bars; pair each bar with its left edge.
for left_edge, bar in zip(bins[:-1], patches):
    if left_edge > 0.7:
        bar.set_facecolor(CB_color_cycle[2])

plt.xlabel("AUC (logistic 1)", fontsize=16)
# The plain sum of the attributions is the variant that works best.
plt.title(perc, fontsize=16)
Text(0.5, 1.0, '94.68%')

Final model AUPR

# Turn the {GO term: AUPR} mapping into a one-column table indexed by GO term,
# discard GO terms whose AUPR is missing, and show the five highest values.
aupr_records = list(GO_terms_aupr_svm_final.items())
GO_terms_aupr_svm_df_final = (
    pd.DataFrame(aupr_records, columns=['goterm', 'aupr'])
    .set_index("goterm")
    .dropna()
)
GO_terms_aupr_svm_df_final.sort_values(by=["aupr"], ascending=False).head()
aupr
goterm
GO:0036289 1.000000
GO:0050896 0.995438
GO:0043170 0.989680
GO:0006807 0.987396
GO:0060440 0.978213
# I HAVE A PROBLEM WITH THE RECALL
# Histogram of the per-GO-term AUPR values. Bars whose left edge lies above
# 0.7 are recolored, and the percentage of GO terms with AUPR > 0.7 is used
# as the plot title.
sns.set(rc={'figure.figsize': (5, 3)})

n_terms = len(GO_terms_aupr_svm_df_final)
n_above = len(GO_terms_aupr_svm_df_final[GO_terms_aupr_svm_df_final["aupr"] > 0.7])
perc = str(round(100 * n_above / n_terms, 2)) + "%"

N, bins, patches = plt.hist(
    GO_terms_aupr_svm_df_final,
    color=CB_color_cycle[6],
    bins=50,
    linewidth=0.1,
)

# bins holds one more edge than there are bars; pair each bar with its left edge.
for left_edge, bar in zip(bins[:-1], patches):
    if left_edge > 0.7:
        bar.set_facecolor(CB_color_cycle[3])

plt.xlabel("AUPR", fontsize=16)
plt.title(perc, fontsize=16)
Text(0.5, 1.0, '20.02%')

Predict for a new drug

Make predictions

# Drugs present in the full attribution table but absent from the annotated
# one; these are the columns the trained models will be asked to score.
all_drug_columns = set(attribution_data_all.columns)
annotated_drug_columns = set(attribution_data_annotated.columns)
unknown = list(all_drug_columns - annotated_drug_columns)

Get the probabilities for all unknown drugs

# Score every drug in `unknown` with each per-GO-term model in `models_svm`.
# Outputs (rows = GO terms, columns = drugs):
#   probabilities_unknown -- column 1 of each model's predict_proba
#   preds_unknown         -- each model's predict() output
predictions = {}
distances = {}  # only filled if the decision_function line below is re-enabled

# The feature list and the per-neuron scaling factor depend only on the model,
# never on the drug, so compute them once instead of once per drug x GO term.
model_inputs = {}
for goterm, model in models_svm.items():
    # Extract the feature names from the model (those are the attributions we need)
    list_nodes = list(model.feature_names_in_)
    # std of each neuron, only use drugs that trained the models
    nodes_std = attribution_data_annotated.loc[list_nodes].T.std()
    model_inputs[goterm] = (list_nodes, nodes_std)

# Collect one single-column frame per drug and concatenate once at the end;
# calling pd.concat inside the loop re-copies the growing result every time.
prob_columns = []
pred_columns = []

for drug in unknown:
    # Fresh dicts per drug so no value can leak from the previous drug.
    probabilities = {}
    predictions = {}
    for goterm, model in models_svm.items():
        list_nodes, nodes_std = model_inputs[goterm]

        # One-row frame: this drug's attributions for the model's features.
        score = attribution_data_all.loc[list_nodes][drug].to_frame().T
        # Divide by the std of each neuron; NaN results are replaced by 0.
        score_mod = score.divide(nodes_std).fillna(0)

        predictions[goterm] = model.predict(score_mod)
        probabilities[goterm] = model.predict_proba(score_mod)[::, 1]  # platt values
        # distances[goterm] = model.decision_function(score_mod)

    drug_probs = pd.DataFrame.from_dict(probabilities).T
    drug_probs.columns = [drug]
    drug_preds = pd.DataFrame.from_dict(predictions).T
    drug_preds.columns = [drug]
    prob_columns.append(drug_probs)
    pred_columns.append(drug_preds)
    print(drug)  # progress indicator

# pd.concat raises on an empty list, so fall back to an empty frame
# (what the incremental version produced when there were no unknown drugs).
probabilities_unknown = pd.concat(prob_columns, axis=1) if prob_columns else pd.DataFrame()
preds_unknown = pd.concat(pred_columns, axis=1) if pred_columns else pd.DataFrame()

Study drug with unknown MOA

Choose drug with unknown MOA…

# Let the user choose one of the drugs in `unknown`, then list that drug's
# predicted probability and predicted label per GO term (layer_number <= 3),
# ordered from the highest to the lowest probability.
combobox_u = interactive(f, drug=widgets.Combobox(options=unknown))

# Names of the form "<GO term>_1", used to select the matching rows of real_go_info.
predictions_nodes = [goterm + "_" + str(1) for goterm in platt_matrix.index]

# add names to go terms
# .copy() makes the filtered table independent of real_go_info, so editing its
# GO_term column below does not trigger pandas' SettingWithCopyWarning.
real_go_info_svm = real_go_info[real_go_info.GO_term.isin(predictions_nodes)].copy()
# Literal (non-regex) removal of "_1" so GO_term matches the plain GO ids;
# regex=False pins the behavior regardless of the pandas version's default.
real_go_info_svm["GO_term"] = real_go_info_svm["GO_term"].str.replace("_1", "", regex=False)

display(combobox_u)
# NOTE(review): assumes f returns the selected drug name -- confirm against f.
selected_drug_u_name = combobox_u.result

# Predicted labels for the selected drug, one row per GO term.
predictions_df = preds_unknown.loc[:, selected_drug_u_name].reset_index()
predictions_df.columns = ["GO_term", "predictions"]

# Predicted probabilities for the selected drug, joined with the GO term
# information and the predicted labels.
probabilities_df = probabilities_unknown.loc[:, selected_drug_u_name].reset_index()
probabilities_df.columns = ["GO_term", "probability"]
probabilities_df = probabilities_df.merge(real_go_info_svm, on="GO_term")
probabilities_df = probabilities_df.merge(predictions_df, on="GO_term")

probabilities_df.loc[probabilities_df["layer_number"] <= 3].sort_values(by=["probability"], ascending=False).head(200)
GO_term probability Name layer_number predictions
820 GO:0033993 0.809556 Response to lipid (1) 3.0 1.0
223 GO:0018108 0.783375 Peptidyl-tyrosine phosphorylation (1) 3.0 1.0
573 GO:0010629 0.742615 Negative regulation of gene expression (1) 3.0 1.0
106 GO:0071900 0.725062 Regulation of protein serine/threonine kinase activity (1) 2.0 1.0
624 GO:0010628 0.702924 Positive regulation of gene expression (1) 3.0 1.0
74 GO:0001817 0.687600 Regulation of cytokine production (1) 3.0 1.0
44 GO:0048812 0.672129 Neuron projection morphogenesis (1) 3.0 1.0
224 GO:0046777 0.661031 Protein autophosphorylation (1) 1.0 1.0
99 GO:0001934 0.658374 Positive regulation of protein phosphorylation (1) 3.0 1.0
570 GO:0045597 0.628072 Positive regulation of cell differentiation (1) 3.0 1.0
839 GO:0031047 0.553888 Gene silencing by rna (1) 2.0 1.0
100 GO:0033138 0.542770 Positive regulation of peptidyl-serine phosphorylation (1) 1.0 1.0
888 GO:0034976 0.540483 Response to endoplasmic reticulum stress (1) 3.0 1.0
633 GO:0051301 0.535293 Cell division (1) 2.0 1.0
821 GO:0034097 0.533600 Response to cytokine (1) 3.0 1.0
423 GO:1902533 0.530199 Positive regulation of intracellular signal transduction (1) 2.0 1.0
596 GO:0060341 0.523390 Regulation of cellular localization (1) 3.0 0.0
729 GO:0120035 0.514954 Regulation of plasma membrane bounded cell projection organization (1) 3.0 1.0
568 GO:0008284 0.513916 Positive regulation of cell population proliferation (1) 2.0 1.0
558 GO:0016032 0.500000 Viral process (1) 3.0 1.0
641 GO:0071417 0.494543 Cellular response to organonitrogen compound (1) 3.0 1.0
353 GO:0006954 0.482633 Inflammatory response (1) 3.0 1.0
9 GO:0043408 0.476159 Regulation of mapk cascade (1) 2.0 1.0
808 GO:1902532 0.449154 Negative regulation of intracellular signal transduction (1) 3.0 1.0
11 GO:0043406 0.445845 Positive regulation of map kinase activity (1) 1.0 1.0
8 GO:0000165 0.443783 Mapk cascade (1) 3.0 0.0
358 GO:0007005 0.427023 Mitochondrion organization (1) 3.0 1.0
134 GO:0002366 0.423235 Leukocyte activation involved in immune response (1) 3.0 1.0
221 GO:0018105 0.422261 Peptidyl-serine phosphorylation (1) 2.0 0.0
285 GO:0051051 0.421500 Negative regulation of transport (1) 3.0 1.0
191 GO:0045944 0.420524 Positive regulation of transcription by rna polymerase ii (1) 2.0 1.0
654 GO:0090398 0.416328 Cellular senescence (1) 1.0 1.0
847 GO:0045055 0.414665 Regulated exocytosis (1) 2.0 1.0
342 GO:2001243 0.411574 Negative regulation of intrinsic apoptotic signaling pathway (1) 2.0 1.0
510 GO:0042063 0.408279 Gliogenesis (1) 3.0 1.0
788 GO:0009410 0.407937 Response to xenobiotic stimulus (1) 2.0 1.0
824 GO:0071363 0.407544 Cellular response to growth factor stimulus (1) 3.0 1.0
496 GO:0048608 0.406699 Reproductive structure development (1) 2.0 1.0
528 GO:0007565 0.406022 Female pregnancy (1) 2.0 1.0
896 GO:0097193 0.400210 Intrinsic apoptotic signaling pathway (1) 3.0 1.0
76 GO:0001819 0.388417 Positive regulation of cytokine production (1) 2.0 1.0
48 GO:0001525 0.386728 Angiogenesis (1) 2.0 1.0
300 GO:0032386 0.384834 Regulation of intracellular transport (1) 2.0 1.0
906 GO:0043549 0.378711 Regulation of kinase activity (1) 3.0 0.0
662 GO:0031648 0.377411 Protein destabilization (1) 0.0 1.0
516 GO:0007423 0.372363 Sensory organ development (1) 3.0 1.0
461 GO:0050804 0.367884 Modulation of chemical synaptic transmission (1) 3.0 1.0
104 GO:0006469 0.364467 Negative regulation of protein kinase activity (1) 2.0 1.0
620 GO:0051098 0.363823 Regulation of binding (1) 3.0 1.0
86 GO:0072006 0.358531 Nephron development (1) 2.0 1.0
473 GO:0008584 0.350294 Male gonad development (1) 1.0 1.0
536 GO:0007610 0.346612 Behavior (1) 3.0 0.0
693 GO:1904646 0.344997 Cellular response to amyloid-beta (1) 0.0 1.0
376 GO:0007015 0.344878 Actin filament organization (1) 3.0 1.0
552 GO:0033365 0.343027 Protein localization to organelle (1) 3.0 1.0
774 GO:0030216 0.342203 Keratinocyte differentiation (1) 2.0 1.0
639 GO:0060326 0.341919 Cell chemotaxis (1) 2.0 1.0
154 GO:0050778 0.335649 Positive regulation of immune response (1) 3.0 0.0
480 GO:0048565 0.333289 Digestive tract development (1) 1.0 1.0
794 GO:0043434 0.332497 Response to peptide hormone (1) 3.0 1.0
651 GO:0050808 0.322913 Synapse organization (1) 3.0 1.0
772 GO:0060485 0.320964 Mesenchyme development (1) 3.0 1.0
406 GO:0048041 0.319689 Focal adhesion assembly (1) 1.0 1.0
698 GO:0070663 0.318299 Regulation of leukocyte proliferation (1) 2.0 1.0
804 GO:0030855 0.317904 Epithelial cell differentiation (1) 3.0 0.0
538 GO:0048266 0.308838 Behavioral response to pain (1) 0.0 1.0
225 GO:0006470 0.305848 Protein dephosphorylation (1) 3.0 1.0
719 GO:0043244 0.305739 Regulation of protein-containing complex disassembly (1) 2.0 1.0
725 GO:0070997 0.305130 Neuron death (1) 2.0 0.0
532 GO:0007596 0.304871 Blood coagulation (1) 3.0 1.0
806 GO:0051056 0.302828 Regulation of small gtpase mediated signal transduction (1) 3.0 1.0
433 GO:0097191 0.302625 Extrinsic apoptotic signaling pathway (1) 3.0 1.0
574 GO:0008285 0.300631 Negative regulation of cell population proliferation (1) 3.0 0.0
49 GO:0001569 0.300115 Branching involved in blood vessel morphogenesis (1) 0.0 1.0
926 GO:0060020 0.297104 Bergmann glial cell differentiation (1) 0.0 1.0
789 GO:0009416 0.295539 Response to light stimulus (1) 2.0 0.0
887 GO:0034504 0.294393 Protein localization to nucleus (1) 2.0 1.0
169 GO:0002764 0.292841 Immune response-regulating signaling pathway (1) 3.0 0.0
890 GO:0071353 0.290912 Cellular response to interleukin-4 (1) 1.0 1.0
505 GO:0007283 0.290337 Spermatogenesis (1) 2.0 1.0
513 GO:0030900 0.289030 Forebrain development (1) 3.0 1.0
241 GO:0006612 0.288796 Protein targeting to membrane (1) 1.0 1.0
644 GO:0071230 0.284292 Cellular response to amino acid stimulus (1) 1.0 1.0
712 GO:0033002 0.280632 Muscle cell proliferation (1) 2.0 1.0
317 GO:0006897 0.277768 Endocytosis (1) 3.0 1.0
230 GO:0030162 0.277751 Regulation of proteolysis (1) 3.0 0.0
607 GO:0042391 0.276380 Regulation of membrane potential (1) 3.0 0.0
689 GO:1905897 0.275961 Regulation of response to endoplasmic reticulum stress (1) 2.0 1.0
105 GO:0045860 0.275729 Positive regulation of protein kinase activity (1) 2.0 0.0
198 GO:0006260 0.275261 Dna replication (1) 3.0 1.0
548 GO:1903829 0.274893 Positive regulation of protein localization (1) 3.0 0.0
934 GO:0051258 0.274092 Protein polymerization (1) 3.0 1.0
378 GO:0031532 0.271109 Actin cytoskeleton reorganization (1) 1.0 1.0
216 GO:0045727 0.270882 Positive regulation of translation (1) 1.0 1.0
642 GO:0034599 0.270088 Cellular response to oxidative stress (1) 3.0 0.0
767 GO:0051146 0.266722 Striated muscle cell differentiation (1) 2.0 0.0
77 GO:0002718 0.262195 Regulation of cytokine production involved in immune response (1) 2.0 1.0
19 GO:0031109 0.261030 Microtubule polymerization or depolymerization (1) 2.0 1.0
584 GO:0040008 0.259597 Regulation of growth (1) 3.0 0.0
937 GO:0051640 0.257672 Organelle localization (1) 3.0 0.0
377 GO:0031032 0.257340 Actomyosin structure organization (1) 2.0 1.0
904 GO:0042113 0.254618 B cell activation (1) 3.0 0.0
133 GO:0043303 0.253768 Mast cell degranulation (1) 1.0 1.0
561 GO:0048511 0.251652 Rhythmic process (1) 3.0 1.0
243 GO:0006606 0.250144 Protein import into nucleus (1) 1.0 1.0
836 GO:1901987 0.249107 Regulation of cell cycle phase transition (1) 3.0 0.0
758 GO:0031099 0.247922 Regeneration (1) 2.0 1.0
739 GO:1902903 0.245529 Regulation of supramolecular fiber organization (1) 3.0 1.0
323 GO:0016236 0.245469 Macroautophagy (1) 3.0 1.0
478 GO:0048568 0.244894 Embryonic organ development (1) 3.0 0.0
103 GO:0042531 0.243583 Positive regulation of tyrosine phosphorylation of stat protein (1) 0.0 1.0
598 GO:0043254 0.240098 Regulation of protein-containing complex assembly (1) 3.0 1.0
864 GO:0030183 0.236405 B cell differentiation (1) 1.0 0.0
301 GO:0032388 0.235842 Positive regulation of intracellular transport (1) 1.0 0.0
692 GO:0010595 0.235403 Positive regulation of endothelial cell migration (1) 2.0 1.0
865 GO:0030217 0.234830 T cell differentiation (1) 3.0 0.0
20 GO:0070507 0.234661 Regulation of microtubule cytoskeleton organization (1) 2.0 1.0
111 GO:0031069 0.234249 Hair follicle morphogenesis (1) 0.0 1.0
363 GO:0051494 0.232679 Negative regulation of cytoskeleton organization (1) 2.0 1.0
226 GO:0035304 0.232643 Regulation of protein dephosphorylation (1) 2.0 1.0
913 GO:0090630 0.231968 Activation of gtpase activity (1) 0.0 1.0
733 GO:0030335 0.231192 Positive regulation of cell migration (1) 3.0 0.0
196 GO:0006357 0.230696 Regulation of transcription by rna polymerase ii (1) 3.0 1.0
524 GO:0007519 0.230417 Skeletal muscle tissue development (1) 2.0 1.0
53 GO:0001570 0.230416 Vasculogenesis (1) 1.0 1.0
26 GO:1901990 0.229708 Regulation of mitotic cell cycle phase transition (1) 2.0 0.0
33 GO:0000423 0.229518 Mitophagy (1) 1.0 1.0
615 GO:0035265 0.227576 Organ growth (1) 2.0 0.0
84 GO:0001822 0.227091 Kidney development (1) 3.0 0.0
151 GO:0006959 0.225795 Humoral immune response (1) 2.0 1.0
244 GO:0042307 0.225164 Positive regulation of protein import into nucleus (1) 0.0 1.0
24 GO:0007346 0.224934 Regulation of mitotic cell cycle (1) 3.0 0.0
162 GO:0060374 0.223666 Mast cell differentiation (1) 0.0 1.0
533 GO:0030168 0.219923 Platelet activation (1) 2.0 1.0
152 GO:0045087 0.218376 Innate immune response (1) 3.0 0.0
523 GO:0007517 0.217930 Muscle organ development (1) 3.0 0.0
138 GO:0002683 0.217495 Negative regulation of immune system process (1) 3.0 0.0
319 GO:0006909 0.217477 Phagocytosis (1) 2.0 1.0
18 GO:0000226 0.216530 Microtubule cytoskeleton organization (1) 3.0 0.0
622 GO:0043086 0.212387 Negative regulation of catalytic activity (1) 3.0 1.0
898 GO:0035924 0.211788 Cellular response to vascular endothelial growth factor stimulus (1) 2.0 1.0
187 GO:0071897 0.211315 Dna biosynthetic process (1) 2.0 0.0
517 GO:0043586 0.209986 Tongue development (1) 1.0 1.0
606 GO:0048638 0.209981 Regulation of developmental growth (1) 2.0 0.0
736 GO:0009617 0.209127 Response to bacterium (1) 3.0 0.0
691 GO:0043542 0.207450 Endothelial cell migration (1) 3.0 0.0
321 GO:0010507 0.205494 Negative regulation of autophagy (1) 1.0 1.0
149 GO:0050853 0.205417 B cell receptor signaling pathway (1) 1.0 0.0
885 GO:1900180 0.205300 Regulation of protein localization to nucleus (1) 1.0 1.0
907 GO:0051881 0.202707 Regulation of mitochondrial membrane potential (1) 1.0 0.0
171 GO:0003014 0.201637 Renal system process (1) 2.0 1.0
211 GO:0031507 0.200984 Heterochromatin assembly (1) 1.0 1.0
64 GO:0071456 0.200564 Cellular response to hypoxia (1) 1.0 0.0
694 GO:0032869 0.197755 Cellular response to insulin stimulus (1) 2.0 0.0
779 GO:0008544 0.196206 Epidermis development (1) 3.0 0.0
634 GO:0061024 0.193065 Membrane organization (1) 2.0 0.0
50 GO:0002040 0.192737 Sprouting angiogenesis (1) 1.0 0.0
610 GO:0031333 0.191284 Negative regulation of protein-containing complex assembly (1) 2.0 1.0
657 GO:0045165 0.191259 Cell fate commitment (1) 3.0 0.0
435 GO:0016055 0.190638 Wnt signaling pathway (1) 2.0 0.0
569 GO:0030307 0.190101 Positive regulation of cell growth (1) 2.0 1.0
200 GO:0006281 0.189187 Dna repair (1) 2.0 0.0
63 GO:0001666 0.188575 Response to hypoxia (1) 2.0 1.0
560 GO:0043473 0.188137 Pigmentation (1) 2.0 1.0
521 GO:0035051 0.187551 Cardiocyte differentiation (1) 2.0 1.0
690 GO:2001020 0.187450 Regulation of response to dna damage stimulus (1) 2.0 0.0
346 GO:0006936 0.187431 Muscle contraction (1) 3.0 1.0
328 GO:0043065 0.184805 Positive regulation of apoptotic process (1) 2.0 0.0
650 GO:0034329 0.183554 Cell junction assembly (1) 2.0 0.0
262 GO:0045429 0.183059 Positive regulation of nitric oxide biosynthetic process (1) 0.0 1.0
117 GO:0060562 0.183024 Epithelial tube morphogenesis (1) 2.0 0.0
583 GO:0032967 0.182852 Positive regulation of collagen biosynthetic process (1) 0.0 1.0
208 GO:0006325 0.182003 Chromatin organization (1) 3.0 0.0
209 GO:0006338 0.181855 Chromatin remodeling (1) 2.0 1.0
882 GO:0032147 0.181843 Activation of protein kinase activity (1) 1.0 0.0
7 GO:0000122 0.181449 Negative regulation of transcription by rna polymerase ii (1) 1.0 0.0
911 GO:0060416 0.180831 Response to growth hormone (1) 1.0 1.0
121 GO:0090050 0.180206 Positive regulation of cell migration involved in sprouting angiogenesis (1) 0.0 1.0
467 GO:0009791 0.179613 Post-embryonic development (1) 1.0 0.0
870 GO:0070527 0.179383 Platelet aggregation (1) 1.0 1.0
781 GO:0008625 0.179216 Extrinsic apoptotic signaling pathway via death domain receptors (1) 1.0 1.0
702 GO:0048146 0.179210 Positive regulation of fibroblast proliferation (1) 0.0 1.0
785 GO:0009266 0.178944 Response to temperature stimulus (1) 2.0 1.0
316 GO:0033157 0.178583 Regulation of intracellular protein transport (1) 1.0 0.0
472 GO:0001553 0.178541 Luteinization (1) 0.0 1.0
174 GO:0010613 0.176998 Positive regulation of cardiac muscle hypertrophy (1) 1.0 1.0
842 GO:0071407 0.176595 Cellular response to organic cyclic compound (1) 3.0 0.0
834 GO:0035195 0.176389 Gene silencing by mirna (1) 1.0 0.0
504 GO:0048709 0.176248 Oligodendrocyte differentiation (1) 2.0 1.0
54 GO:2001214 0.175971 Positive regulation of vasculogenesis (1) 0.0 1.0
600 GO:0010632 0.174413 Regulation of epithelial cell migration (1) 3.0 0.0
682 GO:0007026 0.174040 Negative regulation of microtubule depolymerization (1) 0.0 1.0
276 GO:0016567 0.173194 Protein ubiquitination (1) 3.0 0.0
881 GO:0031929 0.171840 Tor signaling (1) 2.0 1.0
52 GO:0001541 0.171713 Ovarian follicle development (1) 1.0 0.0
310 GO:0051924 0.171261 Regulation of calcium ion transport (1) 3.0 0.0
899 GO:0035994 0.170901 Response to muscle stretch (1) 1.0 1.0
32 GO:0000422 0.170612 Autophagy of mitochondrion (1) 2.0 1.0
703 GO:0048661 0.169791 Positive regulation of smooth muscle cell proliferation (1) 1.0 0.0
447 GO:0007173 0.169218 Epidermal growth factor receptor signaling pathway (1) 2.0 0.0
# Number of GO terms predicted as positive (1) for the selected drug.
# Vectorised Series.sum() instead of the builtin sum(), which iterates the
# Series element by element in Python.
int((probabilities_df["predictions"] == 1).sum())
288
# Number of GO terms predicted as negative (0) for the selected drug.
# Vectorised Series.sum() instead of the builtin sum(), which iterates the
# Series element by element in Python.
int((probabilities_df["predictions"] == 0).sum())
651

A probability below 0.5 does not necessarily mean that the GO term does not belong to the class: a probability of, for example, 0.2 can still correspond to a 1 (annotated to the MoA).

Modify probabilities

Take into account the annotations each GO term has (general GO terms are easier to predict as they have more annotations)

For a drug with unknown MoA…

# Prior ("a priori") log-odds of every GO term: the row sum of the annotation
# matrix divided by its number of columns, converted to a logit.
sum_annotations = slim_matrix_single_neuron.sum(axis=1) / slim_matrix_single_neuron.shape[1]
logits_apriori = np.log(sum_annotations / (1 - sum_annotations))

# Posterior ("a posteriori") log-odds: the probabilities predicted for this drug.
logits_apost = np.log(probabilities_df["probability"] / (1 - probabilities_df["probability"]))

# Align the prior with each row through its GO term instead of relying on both
# tables sharing the same row order; a GO term missing from the annotation
# matrix yields NaN rather than a silently shifted value.
aligned_apriori = logits_apriori.reindex(probabilities_df["GO_term"]).to_numpy()
delta_logits = logits_apost.to_numpy() - aligned_apriori
delta_logits_df = pd.DataFrame({"delta_logits": delta_logits}, index=probabilities_df.index)

# Attach the column row by row (no index-based merge, so a non-default index on
# probabilities_df cannot drop or mismatch rows).
probabilities_mod = probabilities_df.assign(delta_logits=delta_logits)

# Positively predicted terms with layer_number <= 7, highest delta logit first.
is_selected = (probabilities_mod["predictions"] == 1) & (probabilities_mod["layer_number"] <= 7)
probabilities_mod.loc[is_selected].sort_values(by=["delta_logits"], ascending=False)
GO_term probability Name layer_number predictions delta_logits
839 GO:0031047 0.553888 Gene silencing by rna (1) 2.0 1.0 2.192458
662 GO:0031648 0.377411 Protein destabilization (1) 0.0 1.0 2.092834
106 GO:0071900 0.725062 Regulation of protein serine/threonine kinase activity (1) 2.0 1.0 1.944569
33 GO:0000423 0.229518 Mitophagy (1) 1.0 1.0 1.880007
538 GO:0048266 0.308838 Behavioral response to pain (1) 0.0 1.0 1.857031
913 GO:0090630 0.231968 Activation of gtpase activity (1) 0.0 1.0 1.793943
216 GO:0045727 0.270882 Positive regulation of translation (1) 1.0 1.0 1.746070
223 GO:0018108 0.783375 Peptidyl-tyrosine phosphorylation (1) 3.0 1.0 1.672826
719 GO:0043244 0.305739 Regulation of protein-containing complex disassembly (1) 2.0 1.0 1.646098
241 GO:0006612 0.288796 Protein targeting to membrane (1) 1.0 1.0 1.626841
104 GO:0006469 0.364467 Negative regulation of protein kinase activity (1) 2.0 1.0 1.593794
888 GO:0034976 0.540483 Response to endoplasmic reticulum stress (1) 3.0 1.0 1.575980
890 GO:0071353 0.290912 Cellular response to interleukin-4 (1) 1.0 1.0 1.575256
638 GO:0033554 0.884331 Cellular response to stress (1) 4.0 1.0 1.573975
820 GO:0033993 0.809556 Response to lipid (1) 3.0 1.0 1.551573
74 GO:0001817 0.687600 Regulation of cytokine production (1) 3.0 1.0 1.534715
682 GO:0007026 0.174040 Negative regulation of microtubule depolymerization (1) 0.0 1.0 1.533781
528 GO:0007565 0.406022 Female pregnancy (1) 2.0 1.0 1.478881
224 GO:0046777 0.661031 Protein autophosphorylation (1) 1.0 1.0 1.453821
544 GO:0060179 0.161090 Male mating behavior (1) 0.0 1.0 1.440901
32 GO:0000422 0.170612 Autophagy of mitochondrion (1) 2.0 1.0 1.409881
774 GO:0030216 0.342203 Keratinocyte differentiation (1) 2.0 1.0 1.406530
44 GO:0048812 0.672129 Neuron projection morphogenesis (1) 3.0 1.0 1.385004
472 GO:0001553 0.178541 Luteinization (1) 0.0 1.0 1.373326
77 GO:0002718 0.262195 Regulation of cytokine production involved in immune response (1) 2.0 1.0 1.372827
900 GO:0042060 0.612497 Wound healing (1) 4.0 1.0 1.325911
385 GO:0060632 0.143694 Regulation of microtubule-based movement (1) 1.0 1.0 1.306104
726 GO:0065003 0.654123 Protein-containing complex assembly (1) 4.0 1.0 1.304384
926 GO:0060020 0.297104 Bergmann glial cell differentiation (1) 0.0 1.0 1.288696
582 GO:1902459 0.140358 Positive regulation of stem cell population maintenance (1) 0.0 1.0 1.278722
63 GO:0001666 0.188575 Response to hypoxia (1) 2.0 1.0 1.276928
342 GO:2001243 0.411574 Negative regulation of intrinsic apoptotic signaling pathway (1) 2.0 1.0 1.262448
573 GO:0010629 0.742615 Negative regulation of gene expression (1) 3.0 1.0 1.216446
480 GO:0048565 0.333289 Digestive tract development (1) 1.0 1.0 1.203775
174 GO:0010613 0.176998 Positive regulation of cardiac muscle hypertrophy (1) 1.0 1.0 1.199403
676 GO:0030282 0.116755 Bone mineralization (1) 1.0 1.0 1.177409
49 GO:0001569 0.300115 Branching involved in blood vessel morphogenesis (1) 0.0 1.0 1.170620
899 GO:0035994 0.170901 Response to muscle stretch (1) 1.0 1.0 1.156962
443 GO:0035860 0.114067 Glial cell-derived neurotrophic factor receptor signaling pathway (1) 0.0 1.0 1.151083
847 GO:0045055 0.414665 Regulated exocytosis (1) 2.0 1.0 1.096844
134 GO:0002366 0.423235 Leukocyte activation involved in immune response (1) 3.0 1.0 1.076788
181 GO:0006139 0.875555 Nucleobase-containing compound metabolic process (1) 6.0 1.0 1.061941
689 GO:1905897 0.275961 Regulation of response to endoplasmic reticulum stress (1) 2.0 1.0 1.052784
38 GO:0000902 0.813929 Cell morphogenesis (1) 4.0 1.0 1.052132
654 GO:0090398 0.416328 Cellular senescence (1) 1.0 1.0 1.048427
693 GO:1904646 0.344997 Cellular response to amyloid-beta (1) 0.0 1.0 1.043235
100 GO:0033138 0.542770 Positive regulation of peptidyl-serine phosphorylation (1) 1.0 1.0 1.018799
745 GO:0009653 0.877370 Anatomical structure morphogenesis (1) 5.0 1.0 1.014649
198 GO:0006260 0.275261 Dna replication (1) 3.0 1.0 1.007970
133 GO:0043303 0.253768 Mast cell degranulation (1) 1.0 1.0 0.981406
368 GO:0060271 0.126831 Cilium assembly (1) 3.0 1.0 0.970313
285 GO:0051051 0.421500 Negative regulation of transport (1) 3.0 1.0 0.964316
626 GO:0051649 0.802013 Establishment of localization in cell (1) 4.0 1.0 0.957089
378 GO:0031532 0.271109 Actin cytoskeleton reorganization (1) 1.0 1.0 0.947004
651 GO:0050808 0.322913 Synapse organization (1) 3.0 1.0 0.943923
11 GO:0043406 0.445845 Positive regulation of map kinase activity (1) 1.0 1.0 0.939978
34 GO:1903146 0.103490 Regulation of autophagy of mitochondrion (1) 1.0 1.0 0.932003
262 GO:0045429 0.183059 Positive regulation of nitric oxide biosynthetic process (1) 0.0 1.0 0.911660
661 GO:0046326 0.165116 Positive regulation of glucose import (1) 0.0 1.0 0.907432
911 GO:0060416 0.180831 Response to growth hormone (1) 1.0 1.0 0.896690
571 GO:2000010 0.127151 Positive regulation of protein localization to cell surface (1) 0.0 1.0 0.888565
633 GO:0051301 0.535293 Cell division (1) 2.0 1.0 0.887200
483 GO:0035909 0.162013 Aorta morphogenesis (1) 1.0 1.0 0.884752
54 GO:2001214 0.175971 Positive regulation of vasculogenesis (1) 0.0 1.0 0.863534
416 GO:0035556 0.887824 Intracellular signal transduction (1) 4.0 1.0 0.862843
470 GO:0042733 0.097137 Embryonic digit morphogenesis (1) 0.0 1.0 0.861598
886 GO:0034502 0.132453 Protein localization to chromosome (1) 2.0 1.0 0.856780
111 GO:0031069 0.234249 Hair follicle morphogenesis (1) 0.0 1.0 0.832897
880 GO:0031667 0.374380 Response to nutrient levels (1) 4.0 1.0 0.819334
833 GO:0010467 0.880677 Gene expression (1) 5.0 1.0 0.817357
184 GO:0006275 0.102018 Regulation of dna replication (1) 2.0 1.0 0.816174
162 GO:0060374 0.223666 Mast cell differentiation (1) 0.0 1.0 0.815593
265 GO:0051247 0.818839 Positive regulation of protein metabolic process (1) 4.0 1.0 0.802282
821 GO:0034097 0.533600 Response to cytokine (1) 3.0 1.0 0.801775
408 GO:0007165 0.930074 Signal transduction (1) 6.0 1.0 0.801131
849 GO:0043966 0.142338 Histone h3 acetylation (1) 2.0 1.0 0.797379
112 GO:0060789 0.091132 Hair follicle placode formation (1) 0.0 1.0 0.791156
558 GO:0016032 0.500000 Viral process (1) 3.0 1.0 0.785929
151 GO:0006959 0.225795 Humoral immune response (1) 2.0 1.0 0.785159
86 GO:0072006 0.358531 Nephron development (1) 2.0 1.0 0.777591
83 GO:0002720 0.163684 Positive regulation of cytokine production involved in immune response (1) 1.0 1.0 0.776348
53 GO:0001570 0.230416 Vasculogenesis (1) 1.0 1.0 0.770097
532 GO:0007596 0.304871 Blood coagulation (1) 3.0 1.0 0.764501
567 GO:0051641 0.841801 Cellular localization (1) 5.0 1.0 0.761481
639 GO:0060326 0.341919 Cell chemotaxis (1) 2.0 1.0 0.758938
19 GO:0031109 0.261030 Microtubule polymerization or depolymerization (1) 2.0 1.0 0.746075
800 GO:0030521 0.102431 Androgen receptor signaling pathway (1) 1.0 1.0 0.729092
511 GO:0030182 0.721830 Neuron differentiation (1) 5.0 1.0 0.726501
806 GO:0051056 0.302828 Regulation of small gtpase mediated signal transduction (1) 3.0 1.0 0.724275
527 GO:0007528 0.109807 Neuromuscular junction development (1) 1.0 1.0 0.722229
794 GO:0043434 0.332497 Response to peptide hormone (1) 3.0 1.0 0.716778
99 GO:0001934 0.658374 Positive regulation of protein phosphorylation (1) 3.0 1.0 0.708241
896 GO:0097193 0.400210 Intrinsic apoptotic signaling pathway (1) 3.0 1.0 0.705651
624 GO:0010628 0.702924 Positive regulation of gene expression (1) 3.0 1.0 0.704419
722 GO:0042325 0.780828 Regulation of phosphorylation (1) 5.0 1.0 0.698909
752 GO:0043170 0.942545 Macromolecule metabolic process (1) 7.0 1.0 0.693446
517 GO:0043586 0.209986 Tongue development (1) 1.0 1.0 0.692360
182 GO:0016070 0.778891 Rna metabolic process (1) 5.0 1.0 0.687621
281 GO:0006811 0.500000 Ion transport (1) 6.0 1.0 0.686632
586 GO:2000773 0.121640 Negative regulation of cellular senescence (1) 0.0 1.0 0.685597
808 GO:1902532 0.449154 Negative regulation of intracellular signal transduction (1) 3.0 1.0 0.684968
570 GO:0045597 0.628072 Positive regulation of cell differentiation (1) 3.0 1.0 0.680798
76 GO:0001819 0.388417 Positive regulation of cytokine production (1) 2.0 1.0 0.679733
934 GO:0051258 0.274092 Protein polymerization (1) 3.0 1.0 0.677814
713 GO:0035726 0.096936 Common myeloid progenitor cell proliferation (1) 0.0 1.0 0.667842
386 GO:0007049 0.727375 Cell cycle (1) 6.0 1.0 0.665704
473 GO:0008584 0.350294 Male gonad development (1) 1.0 1.0 0.663186
813 GO:0051898 0.103239 Negative regulation of protein kinase b signaling (1) 0.0 1.0 0.653209
698 GO:0070663 0.318299 Regulation of leukocyte proliferation (1) 2.0 1.0 0.652091
729 GO:0120035 0.514954 Regulation of plasma membrane bounded cell projection organization (1) 3.0 1.0 0.631426
881 GO:0031929 0.171840 Tor signaling (1) 2.0 1.0 0.624585
496 GO:0048608 0.406699 Reproductive structure development (1) 2.0 1.0 0.619202
559 GO:0022414 0.641754 Reproductive process (1) 4.0 1.0 0.617772
562 GO:0050896 0.963328 Response to stimulus (1) 7.0 1.0 0.605804
381 GO:0008064 0.113103 Regulation of actin polymerization or depolymerization (1) 2.0 1.0 0.603158
855 GO:1903578 0.141298 Regulation of atp metabolic process (1) 1.0 1.0 0.602865
619 GO:0050790 0.825081 Regulation of catalytic activity (1) 4.0 1.0 0.598054
476 GO:0048714 0.112165 Positive regulation of oligodendrocyte differentiation (1) 0.0 1.0 0.593772
363 GO:0051494 0.232679 Negative regulation of cytoskeleton organization (1) 2.0 1.0 0.593448
652 GO:0042180 0.090402 Cellular ketone metabolic process (1) 3.0 1.0 0.590857
423 GO:1902533 0.530199 Positive regulation of intracellular signal transduction (1) 2.0 1.0 0.581066
319 GO:0006909 0.217477 Phagocytosis (1) 2.0 1.0 0.578885
353 GO:0006954 0.482633 Inflammatory response (1) 3.0 1.0 0.578336
406 GO:0048041 0.319689 Focal adhesion assembly (1) 1.0 1.0 0.577606
585 GO:0048589 0.599054 Developmental growth (1) 4.0 1.0 0.575879
461 GO:0050804 0.367884 Modulation of chemical synaptic transmission (1) 3.0 1.0 0.568933
711 GO:0019752 0.316625 Carboxylic acid metabolic process (1) 4.0 1.0 0.563481
629 GO:0051174 0.783163 Regulation of phosphorus metabolic process (1) 6.0 1.0 0.558257
82 GO:0032743 0.079738 Positive regulation of interleukin-2 production (1) 0.0 1.0 0.545266
510 GO:0042063 0.408279 Gliogenesis (1) 3.0 1.0 0.539129
730 GO:0031175 0.631148 Neuron projection development (1) 4.0 1.0 0.537147
741 GO:0016477 0.682158 Cell migration (1) 4.0 1.0 0.536651
663 GO:0050821 0.165950 Protein stabilization (1) 0.0 1.0 0.535214
343 GO:1902166 0.145254 Negative regulation of intrinsic apoptotic signaling pathway in response to dna damage by p53 class mediator (1) 0.0 1.0 0.525494
213 GO:0006396 0.268052 Rna processing (1) 4.0 1.0 0.523646
563 GO:1900272 0.063979 Negative regulation of long-term synaptic potentiation (1) 0.0 1.0 0.517858
36 GO:0000723 0.137533 Telomere maintenance (1) 1.0 1.0 0.515446
524 GO:0007519 0.230417 Skeletal muscle tissue development (1) 2.0 1.0 0.511692
93 GO:0001843 0.082584 Neural tube closure (1) 1.0 1.0 0.491845
226 GO:0035304 0.232643 Regulation of protein dephosphorylation (1) 2.0 1.0 0.490892
684 GO:0051770 0.158964 Positive regulation of nitric-oxide synthase biosynthetic process (1) 0.0 1.0 0.483868
583 GO:0032967 0.182852 Positive regulation of collagen biosynthetic process (1) 0.0 1.0 0.478922
641 GO:0071417 0.494543 Cellular response to organonitrogen compound (1) 3.0 1.0 0.475108
788 GO:0009410 0.407937 Response to xenobiotic stimulus (1) 2.0 1.0 0.474799
412 GO:0009966 0.816584 Regulation of signal transduction (1) 5.0 1.0 0.474348
735 GO:0046718 0.151222 Viral entry into host cell (1) 1.0 1.0 0.472175
266 GO:0030163 0.372430 Protein catabolic process (1) 4.0 1.0 0.453053
196 GO:0006357 0.230696 Regulation of transcription by rna polymerase ii (1) 3.0 1.0 0.447389
840 GO:0043154 0.084113 Negative regulation of cysteine-type endopeptidase activity involved in apoptotic process (1) 1.0 1.0 0.427212
299 GO:0030705 0.133167 Cytoskeleton-dependent intracellular transport (1) 3.0 1.0 0.424567
433 GO:0097191 0.302625 Extrinsic apoptotic signaling pathway (1) 3.0 1.0 0.420731
610 GO:0031333 0.191284 Negative regulation of protein-containing complex assembly (1) 2.0 1.0 0.417629
501 GO:0007507 0.480118 Heart development (1) 4.0 1.0 0.417365
901 GO:0042110 0.432222 T cell activation (1) 4.0 1.0 0.413843
260 GO:0006807 0.931522 Nitrogen compound metabolic process (1) 7.0 1.0 0.413085
785 GO:0009266 0.178944 Response to temperature stimulus (1) 2.0 1.0 0.412491
358 GO:0007005 0.427023 Mitochondrion organization (1) 3.0 1.0 0.412213
377 GO:0031032 0.257340 Actomyosin structure organization (1) 2.0 1.0 0.410068
484 GO:0007399 0.731066 Nervous system development (1) 6.0 1.0 0.409544
136 GO:0002376 0.760395 Immune system process (1) 6.0 1.0 0.409057
211 GO:0031507 0.200984 Heterochromatin assembly (1) 1.0 1.0 0.406540
20 GO:0070507 0.234661 Regulation of microtubule cytoskeleton organization (1) 2.0 1.0 0.406533
356 GO:0006996 0.787638 Organelle organization (1) 5.0 1.0 0.400534
376 GO:0007015 0.344878 Actin filament organization (1) 3.0 1.0 0.399824
173 GO:0003300 0.147997 Cardiac muscle hypertrophy (1) 2.0 1.0 0.399424
620 GO:0051098 0.363823 Regulation of binding (1) 3.0 1.0 0.394294
922 GO:0036324 0.117627 Vascular endothelial growth factor receptor-2 signaling pathway (1) 0.0 1.0 0.392320
487 GO:0030325 0.087339 Adrenal gland development (1) 0.0 1.0 0.389651
781 GO:0008625 0.179216 Extrinsic apoptotic signaling pathway via death domain receptors (1) 1.0 1.0 0.375455
608 GO:0043114 0.067682 Regulation of vascular permeability (1) 1.0 1.0 0.368329
103 GO:0042531 0.243583 Positive regulation of tyrosine phosphorylation of stat protein (1) 0.0 1.0 0.365635
102 GO:0050731 0.126350 Positive regulation of peptidyl-tyrosine phosphorylation (1) 2.0 1.0 0.364186
504 GO:0048709 0.176248 Oligodendrocyte differentiation (1) 2.0 1.0 0.355145
324 GO:0016241 0.118988 Regulation of macroautophagy (1) 2.0 1.0 0.349328
218 GO:0006468 0.771378 Protein phosphorylation (1) 5.0 1.0 0.348022
171 GO:0003014 0.201637 Renal system process (1) 2.0 1.0 0.341557
644 GO:0071230 0.284292 Cellular response to amino acid stimulus (1) 1.0 1.0 0.332289
321 GO:0010507 0.205494 Negative regulation of autophagy (1) 1.0 1.0 0.332037
533 GO:0030168 0.219923 Platelet activation (1) 2.0 1.0 0.322598
546 GO:0008104 0.599923 Protein localization (1) 5.0 1.0 0.318133
362 GO:0033043 0.644550 Regulation of organelle organization (1) 4.0 1.0 0.315090
851 GO:0070933 0.058559 Histone h4 deacetylation (1) 0.0 1.0 0.313671
460 GO:0023061 0.159325 Signal release (1) 4.0 1.0 0.312808
438 GO:0007179 0.169016 Transforming growth factor beta receptor signaling pathway (1) 1.0 1.0 0.304505
300 GO:0032386 0.384834 Regulation of intracellular transport (1) 2.0 1.0 0.296704
920 GO:0036092 0.063107 Phosphatidylinositol-3-phosphate biosynthetic process (1) 0.0 1.0 0.293434
505 GO:0007283 0.290337 Spermatogenesis (1) 2.0 1.0 0.287750
357 GO:0006997 0.062583 Nucleus organization (1) 2.0 1.0 0.284535
748 GO:0009056 0.627534 Catabolic process (1) 5.0 1.0 0.276960
48 GO:0001525 0.386728 Angiogenesis (1) 2.0 1.0 0.264850
541 GO:0008542 0.131766 Visual learning (1) 0.0 1.0 0.264386
98 GO:0001932 0.669289 Regulation of protein phosphorylation (1) 4.0 1.0 0.263137
734 GO:0051702 0.152779 Biological process involved in interaction with symbiont (1) 2.0 1.0 0.263093
243 GO:0006606 0.250144 Protein import into nucleus (1) 1.0 1.0 0.261493
513 GO:0030900 0.289030 Forebrain development (1) 3.0 1.0 0.257354
673 GO:0043392 0.109472 Negative regulation of dna binding (1) 1.0 1.0 0.255228
346 GO:0006936 0.187431 Muscle contraction (1) 3.0 1.0 0.250863
552 GO:0033365 0.343027 Protein localization to organelle (1) 3.0 1.0 0.239223
9 GO:0043408 0.476159 Regulation of mapk cascade (1) 2.0 1.0 0.238056
887 GO:0034504 0.294393 Protein localization to nucleus (1) 2.0 1.0 0.236099
598 GO:0043254 0.240098 Regulation of protein-containing complex assembly (1) 3.0 1.0 0.234151
280 GO:0006810 0.773180 Transport (1) 7.0 1.0 0.229528
267 GO:0045732 0.069485 Positive regulation of protein catabolic process (1) 2.0 1.0 0.220326
668 GO:0010976 0.084952 Positive regulation of neuron projection development (1) 1.0 1.0 0.216502
212 GO:0051090 0.131395 Regulation of dna-binding transcription factor activity (1) 2.0 1.0 0.215454
799 GO:0009743 0.121060 Response to carbohydrate (1) 2.0 1.0 0.214795
675 GO:0071277 0.074147 Cellular response to calcium ion (1) 0.0 1.0 0.211553
495 GO:0060976 0.094845 Coronary vasculature development (1) 1.0 1.0 0.210353
803 GO:0042475 0.125550 Odontogenesis of dentin-containing tooth (1) 2.0 1.0 0.208932
671 GO:0032092 0.120001 Positive regulation of protein binding (1) 1.0 1.0 0.204806
772 GO:0060485 0.320964 Mesenchyme development (1) 3.0 1.0 0.203758
121 GO:0090050 0.180206 Positive regulation of cell migration involved in sprouting angiogenesis (1) 0.0 1.0 0.202697
801 GO:0033143 0.078487 Regulation of intracellular steroid hormone receptor signaling pathway (1) 1.0 1.0 0.199503
870 GO:0070527 0.179383 Platelet aggregation (1) 1.0 1.0 0.197119
314 GO:0070588 0.238228 Calcium ion transmembrane transport (1) 4.0 1.0 0.196919
656 GO:0030154 0.829241 Cell differentiation (1) 6.0 1.0 0.193962
645 GO:0071300 0.128940 Cellular response to retinoic acid (1) 0.0 1.0 0.193772
897 GO:0035767 0.138830 Endothelial cell chemotaxis (1) 1.0 1.0 0.192330
332 GO:0097190 0.410188 Apoptotic signaling pathway (1) 4.0 1.0 0.189603
643 GO:0071222 0.143456 Cellular response to lipopolysaccharide (1) 2.0 1.0 0.189188
898 GO:0035924 0.211788 Cellular response to vascular endothelial growth factor stimulus (1) 2.0 1.0 0.184593
459 GO:0007267 0.506882 Cell-cell signaling (1) 5.0 1.0 0.184371
323 GO:0016236 0.245469 Macroautophagy (1) 3.0 1.0 0.183750
317 GO:0006897 0.277768 Endocytosis (1) 3.0 1.0 0.178141
244 GO:0042307 0.225164 Positive regulation of protein import into nucleus (1) 0.0 1.0 0.177873
728 GO:0030032 0.140303 Lamellipodium assembly (1) 1.0 1.0 0.163285
640 GO:0071310 0.661841 Cellular response to organic substance (1) 4.0 1.0 0.156044
686 GO:0097009 0.055448 Energy homeostasis (1) 0.0 1.0 0.155920
692 GO:0010595 0.235403 Positive regulation of endothelial cell migration (1) 2.0 1.0 0.154755
309 GO:0034765 0.277608 Regulation of ion transmembrane transport (1) 4.0 1.0 0.153881
770 GO:0009887 0.516411 Animal organ morphogenesis (1) 4.0 1.0 0.152678
209 GO:0006338 0.181855 Chromatin remodeling (1) 2.0 1.0 0.147943
547 GO:0032880 0.479479 Regulation of protein localization (1) 4.0 1.0 0.144928
918 GO:0046854 0.054745 Phosphatidylinositol phosphate biosynthetic process (1) 1.0 1.0 0.142413
329 GO:0043066 0.610305 Negative regulation of apoptotic process (1) 4.0 1.0 0.132954
824 GO:0071363 0.407544 Cellular response to growth factor stimulus (1) 3.0 1.0 0.122808
603 GO:0061045 0.068278 Negative regulation of wound healing (1) 2.0 1.0 0.122775
225 GO:0006470 0.305848 Protein dephosphorylation (1) 3.0 1.0 0.111956
388 GO:0051726 0.483775 Regulation of cell cycle (1) 5.0 1.0 0.109432
516 GO:0007423 0.372363 Sensory organ development (1) 3.0 1.0 0.106514
261 GO:0051171 0.836045 Regulation of nitrogen compound metabolic process (1) 6.0 1.0 0.100916
375 GO:0051496 0.076120 Positive regulation of stress fiber assembly (1) 0.0 1.0 0.097110
758 GO:0031099 0.247922 Regeneration (1) 2.0 1.0 0.096133
55 GO:0001649 0.136472 Osteoblast differentiation (1) 1.0 1.0 0.091104
227 GO:0032516 0.155024 Positive regulation of phosphoprotein phosphatase activity (1) 0.0 1.0 0.090968
191 GO:0045944 0.420524 Positive regulation of transcription by rna polymerase ii (1) 2.0 1.0 0.084844
283 GO:0051049 0.611523 Regulation of transport (1) 5.0 1.0 0.084359
739 GO:1902903 0.245529 Regulation of supramolecular fiber organization (1) 3.0 1.0 0.083258
569 GO:0030307 0.190101 Positive regulation of cell growth (1) 2.0 1.0 0.078819
679 GO:0042310 0.060677 Vasoconstriction (1) 1.0 1.0 0.075349
405 GO:0007159 0.162103 Leukocyte cell-cell adhesion (1) 3.0 1.0 0.074992
566 GO:0032879 0.660318 Regulation of localization (1) 6.0 1.0 0.074219
568 GO:0008284 0.513916 Positive regulation of cell population proliferation (1) 2.0 1.0 0.073069
561 GO:0048511 0.251652 Rhythmic process (1) 3.0 1.0 0.067632
702 GO:0048146 0.179210 Positive regulation of fibroblast proliferation (1) 0.0 1.0 0.067004
403 GO:0033628 0.059919 Regulation of cell adhesion mediated by integrin (1) 1.0 1.0 0.061975
885 GO:1900180 0.205300 Regulation of protein localization to nucleus (1) 1.0 1.0 0.060202
228 GO:0006508 0.347532 Proteolysis (1) 4.0 1.0 0.056727
560 GO:0043473 0.188137 Pigmentation (1) 2.0 1.0 0.036612
777 GO:0050680 0.103322 Negative regulation of epithelial cell proliferation (1) 2.0 1.0 0.036383
751 GO:0046034 0.094236 Atp metabolic process (1) 2.0 1.0 0.034832
591 GO:0010941 0.705559 Regulation of cell death (1) 5.0 1.0 0.026616
818 GO:0010243 0.557617 Response to organonitrogen compound (1) 4.0 1.0 0.022040
724 GO:0036473 0.154959 Cell death in response to oxidative stress (1) 2.0 1.0 0.021429
153 GO:0050776 0.424435 Regulation of immune response (1) 4.0 1.0 0.011047
521 GO:0035051 0.187551 Cardiocyte differentiation (1) 2.0 1.0 0.003905
194 GO:0006355 0.544373 Regulation of transcription, dna-templated (1) 4.0 1.0 0.003607
622 GO:0043086 0.212387 Negative regulation of catalytic activity (1) 3.0 1.0 -0.003920
494 GO:0060840 0.094489 Artery development (1) 2.0 1.0 -0.013518
936 GO:0051000 0.059868 Positive regulation of nitric-oxide synthase activity (1) 0.0 1.0 -0.017649
382 GO:0030041 0.093571 Actin filament polymerization (1) 2.0 1.0 -0.024295
712 GO:0033002 0.280632 Muscle cell proliferation (1) 2.0 1.0 -0.031117
750 GO:0044281 0.305872 Small molecule metabolic process (1) 5.0 1.0 -0.053702
257 GO:0046488 0.135113 Phosphatidylinositol metabolic process (1) 2.0 1.0 -0.069795
731 GO:0031529 0.036028 Ruffle organization (1) 1.0 1.0 -0.085835
469 GO:0060173 0.072132 Limb development (1) 1.0 1.0 -0.088184
296 GO:0015031 0.292221 Protein transport (1) 4.0 1.0 -0.098692
927 GO:0042632 0.034870 Cholesterol homeostasis (1) 0.0 1.0 -0.119705
78 GO:0032760 0.100402 Positive regulation of tumor necrosis factor production (1) 0.0 1.0 -0.132748
856 GO:0019722 0.034146 Calcium-mediated signaling (1) 2.0 1.0 -0.141431
288 GO:0032940 0.217398 Secretion by cell (1) 5.0 1.0 -0.147194
705 GO:0051353 0.082787 Positive regulation of oxidoreductase activity (1) 1.0 1.0 -0.158575
176 GO:0044262 0.129234 Cellular carbohydrate metabolic process (1) 3.0 1.0 -0.190099
180 GO:0019318 0.046251 Hexose metabolic process (1) 2.0 1.0 -0.211363
440 GO:0030512 0.045444 Negative regulation of transforming growth factor beta receptor signaling pathway (1) 0.0 1.0 -0.229806
331 GO:0071887 0.072964 Leukocyte apoptotic process (1) 2.0 1.0 -0.295526
823 GO:0045471 0.068117 Response to ethanol (1) 1.0 1.0 -0.639920
232 GO:0010951 0.025246 Negative regulation of endopeptidase activity (1) 2.0 1.0 -0.917276
# Top 30 positively predicted GO terms (layer_number <= 7) ranked by delta
# logit. The filter/sort is computed once and reused for the three lists
# instead of being repeated for every column.
top_terms = (
    probabilities_mod.loc[
        (probabilities_mod["predictions"] == 1) & (probabilities_mod["layer_number"] <= 7)
    ]
    .sort_values(by=["delta_logits"], ascending=False)
    .head(30)
)

# Drop the last four characters of each name (the trailing " (1)" suffix).
names2 = [name[:-4] for name in top_terms["Name"]]
terms2 = list(top_terms["GO_term"])
logits2 = list(top_terms["delta_logits"])

for term, name, logit in zip(terms2, names2, logits2):
    print(term, name, logit)
GO:0031047 Gene silencing by rna 2.192457619336144
GO:0031648 Protein destabilization 2.092833916210919
GO:0071900 Regulation of protein serine/threonine kinase activity 1.944569179670069
GO:0000423 Mitophagy 1.8800065969407627
GO:0048266 Behavioral response to pain 1.8570307139212263
GO:0090630 Activation of gtpase activity 1.793943480641404
GO:0045727 Positive regulation of translation 1.7460704310285706
GO:0018108 Peptidyl-tyrosine phosphorylation 1.6728257190053135
GO:0043244 Regulation of protein-containing complex disassembly 1.6460980615310405
GO:0006612 Protein targeting to membrane 1.6268405228374492
GO:0006469 Negative regulation of protein kinase activity 1.5937943848967007
GO:0034976 Response to endoplasmic reticulum stress 1.575980420814227
GO:0071353 Cellular response to interleukin-4 1.5752555355925209
GO:0033554 Cellular response to stress 1.573975479674923
GO:0033993 Response to lipid 1.5515734212063519
GO:0001817 Regulation of cytokine production 1.5347154822720621
GO:0007026 Negative regulation of microtubule depolymerization 1.5337813437727923
GO:0007565 Female pregnancy 1.4788810435976616
GO:0046777 Protein autophosphorylation 1.453821209026271
GO:0060179 Male mating behavior 1.4409006464912832
GO:0000422 Autophagy of mitochondrion 1.4098807478143418
GO:0030216 Keratinocyte differentiation 1.406529668352229
GO:0048812 Neuron projection morphogenesis 1.3850039160074457
GO:0001553 Luteinization 1.3733255324441331
GO:0002718 Regulation of cytokine production involved in immune response 1.3728274508963876
GO:0042060 Wound healing 1.325910525701464
GO:0060632 Regulation of microtubule-based movement 1.3061039503079361
GO:0065003 Protein-containing complex assembly 1.304383769510548
GO:0060020 Bergmann glial cell differentiation 1.2886961413918279
GO:1902459 Positive regulation of stem cell population maintenance 1.2787216247479065
# Lollipop chart of the selected GO terms (names2) against their delta logit
# (logits2), saved as a PNG named after the selected drug.
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# set font
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Roboto'

# set the style of the axes and the text color
plt.rcParams['axes.edgecolor']='#333F4B'
plt.rcParams['axes.linewidth']=0.8
plt.rcParams['xtick.color']='#333F4B'
plt.rcParams['ytick.color']='#333F4B'
plt.rcParams['text.color']='#333F4B'


# delta-logit values indexed by GO-term name; the variable and column are
# called 'percentage(s)' but they hold delta logits, not percentages
percentages = pd.Series(logits2, 
                        index=names2)
df = pd.DataFrame({'percentage' : percentages})
# ascending sort, so the largest delta logit gets the highest y position
df = df.sort_values(by='percentage')

# we first need a numeric placeholder for the y axis (one slot per GO term)
my_range=list(range(1,len(df.index)+1))

fig, ax = plt.subplots(figsize=(4,17))

# draw, for each GO term, a horizontal line that starts at x = 0 and ends at
# the term's delta-logit value
plt.hlines(y=my_range, xmin=0, xmax=df['percentage'], color='#208EA3', alpha=0.2, linewidth=14)

# draw, for each GO term, a dot at its delta-logit value
plt.plot(df['percentage'], my_range, "o", markersize=14, color='#208EA3', alpha=0.8)

# set labels
ax.set_xlabel(' Δlogit', fontsize=25, fontweight='black', color = '#36382E')
ax.set_ylabel('')
ax.set_facecolor(color="white")
ax.set_alpha(1)

# set axis: label every y slot with the corresponding GO-term name
ax.tick_params(axis='both', which='major', labelsize=30)
plt.yticks(my_range, df.index)

# add a horizontal label for the y axis and the drug name as title; both are
# positioned with hand-tuned coordinates
fig.text(-0.58, 0.862, 'MoA (GO terms)', fontsize=27, fontweight='black', color = '#36382E')
fig.text(0.2, 0.9, selected_drug_u_name.capitalize(), fontsize=30, fontweight='black', color = '#36382E')


# change the style of the axis spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.spines['left'].set_bounds((1, len(my_range)))
# x axis starts at 0, so a negative delta logit would fall outside the view
ax.set_xlim(0,max(logits2)+0.1)

ax.spines['left'].set_position(('outward', 8))
ax.spines['bottom'].set_position(('outward', 5))

# resultsdir is concatenated as-is, so it must already end with a path separator
plt.savefig(resultsdir+selected_drug_u_name+'_top_terms.png', dpi=300, bbox_inches='tight')

For known drug…

# Show the drug selector and read the chosen drug name from it.
display(combobox)
selected_drug_name = combobox.result

# NOTE: THESE ARE THE TEST LOGITS!! (column of platt_matrix for the chosen drug)
drug_column = platt_matrix.loc[:, selected_drug_name]
train_drug_logs = pd.DataFrame(drug_column).reset_index()
train_drug_logs.columns = ["GO_term", "probability"]
train_drug_logs = train_drug_logs.merge(real_go_info_svm, on="GO_term")

# The 30 most probable GO terms among those with layer_number <= 3.
shallow_terms = train_drug_logs.loc[train_drug_logs["layer_number"] <= 3]
shallow_terms.sort_values(by=["probability"], ascending=False).head(30)

# For known drugs: size of the overlap between those top-30 terms and column 1
# of the drug's entry in compounds_GOterms_matches (presumably its matched GO
# terms -- confirm against where that dictionary is built).
top30_terms = set(
    shallow_terms.sort_values(by=["probability"], ascending=False).head(30)["GO_term"]
)
matched_terms = set(pd.DataFrame(compounds_GOterms_matches[selected_drug_name])[1])
len(top30_terms.intersection(matched_terms))
30
# Box plot of the predicted probability of every GO term, grouped by the value
# the annotation matrix holds for that term and the selected drug.
annotation_by_term = slim_matrix_single_neuron.loc[train_drug_logs["GO_term"], selected_drug_name]
probability_by_term = train_drug_logs.set_index("GO_term")["probability"]
ax = sns.boxplot(
    x=annotation_by_term,
    y=probability_by_term,
    data=plot,
    showfliers=True,
)

# Delta logit = posterior log-odds predicted for the drug minus the prior
# log-odds of the GO term.
# Prior: row sum of the annotation matrix divided by its number of columns.
sum_annotations = slim_matrix_single_neuron.sum(axis=1) / slim_matrix_single_neuron.shape[1]
logits_apriori = np.log(sum_annotations / (1 - sum_annotations))
logits_apost = np.log(train_drug_logs["probability"] / (1 - train_drug_logs["probability"]))

# train_drug_logs was produced by a merge, so its row order/length is not
# guaranteed to match the annotation matrix: look the prior up by GO term
# instead of subtracting by position. A missing GO term yields NaN.
aligned_apriori = logits_apriori.reindex(train_drug_logs["GO_term"]).to_numpy()
delta_logits = logits_apost.to_numpy() - aligned_apriori
delta_logits_df = pd.DataFrame({"delta_logits": delta_logits}, index=train_drug_logs.index)

# Attach the column row by row (no index-based merge needed).
train_drug_mod = train_drug_logs.assign(delta_logits=delta_logits)

# Top 30 terms with layer_number <= 3, highest delta logit first.
train_drug_mod.loc[train_drug_mod["layer_number"] <= 3].sort_values(by=["delta_logits"], ascending=False).head(30)
GO_term probability Name layer_number delta_logits
578 GO:2000379 0.603184 Positive regulation of reactive oxygen species metabolic process (1) 1.0 2.770126
253 GO:0043552 0.735601 Positive regulation of phosphatidylinositol 3-kinase activity (1) 0.0 2.707570
80 GO:0010575 0.438910 Positive regulation of vascular endothelial growth factor production (1) 0.0 2.282492
633 GO:0051301 0.817491 Cell division (1) 2.0 2.245231
224 GO:0046777 0.780375 Protein autophosphorylation (1) 1.0 2.053782
423 GO:1902533 0.825532 Positive regulation of intracellular signal transduction (1) 2.0 2.014408
458 GO:0035025 0.374269 Positive regulation of rho protein signal transduction (1) 0.0 1.952270
848 GO:0071670 0.345148 Smooth muscle cell chemotaxis (1) 0.0 1.887644
348 GO:0006939 0.412178 Smooth muscle contraction (1) 2.0 1.842256
350 GO:0045987 0.348868 Positive regulation of smooth muscle contraction (1) 1.0 1.783402
908 GO:0051899 0.591286 Membrane depolarization (1) 2.0 1.675960
653 GO:0072593 0.631785 Reactive oxygen species metabolic process (1) 3.0 1.650121
115 GO:0060312 0.275354 Regulation of blood vessel remodeling (1) 0.0 1.625762
926 GO:0060020 0.369360 Bergmann glial cell differentiation (1) 0.0 1.614859
923 GO:0048170 0.295742 Positive regulation of long-term neuronal synaptic plasticity (1) 0.0 1.598558
99 GO:0001934 0.823688 Positive regulation of protein phosphorylation (1) 3.0 1.593723
853 GO:0038083 0.500000 Peptidyl-tyrosine autophosphorylation (1) 0.0 1.588712
713 GO:0035726 0.210551 Common myeloid progenitor cell proliferation (1) 0.0 1.577983
445 GO:0048008 0.414654 Platelet-derived growth factor receptor signaling pathway (1) 1.0 1.552362
857 GO:0035584 0.298388 Calcium-mediated signaling using intracellular calcium source (1) 0.0 1.496390
333 GO:1904019 0.457451 Epithelial cell apoptotic process (1) 1.0 1.481165
933 GO:0051150 0.337253 Regulation of smooth muscle cell differentiation (1) 1.0 1.474264
352 GO:0014827 0.254529 Intestine smooth muscle contraction (1) 0.0 1.453480
814 GO:0090037 0.235572 Positive regulation of protein kinase c signaling (1) 0.0 1.416277
894 GO:0048017 0.609803 Inositol lipid-mediated signaling (1) 1.0 1.399588
506 GO:0007286 0.240113 Spermatid development (1) 1.0 1.376021
742 GO:0035733 0.239304 Hepatic stellate cell activation (1) 0.0 1.371582
10 GO:0051403 0.429898 Stress-activated mapk cascade (1) 2.0 1.369506
702 GO:0048146 0.440487 Positive regulation of fibroblast proliferation (1) 0.0 1.349525
782 GO:1902042 0.289337 Negative regulation of extrinsic apoptotic signaling pathway via death domain receptors (1) 0.0 1.347889
# Box plot of the delta logit of every GO term, grouped by the value the
# annotation matrix holds for that term and the selected drug.
annotation_by_term = slim_matrix_single_neuron.loc[train_drug_mod["GO_term"], selected_drug_name]
delta_logit_by_term = train_drug_mod.set_index("GO_term")["delta_logits"]
ax = sns.boxplot(
    x=annotation_by_term,
    y=delta_logit_by_term,
    data=plot,
    showfliers=True,
)

SVM GO TERM 2D representation

from sklearn.manifold import TSNE
import plotly.express as px

Choose the GO term to study…

# Show the GO-term picker and read back the value it currently holds.
display(combobox_go)
selected_goterm = combobox_go.result

# Display the real_go_info row(s) whose identifier is the chosen term plus "_1".
is_first_node = real_go_info["GO_term"] == selected_goterm + "_1"
real_go_info.loc[is_first_node]
GO_term Name layer_number
4338 GO:0071353_1 Cellular response to interleukin-4 (1) 1.0
# Identifiers "<term>_1" ... "<term>_6" of the selected GO term.
list_nodes = [selected_goterm + "_" + str(node_idx) for node_idx in range(1, 7)]

# Attribution rows of those identifiers, transposed so they become the columns.
score = attribution_data_annotated.loc[list_nodes].T
# Scale every column by its own standard deviation; NaNs produced by the
# division are replaced with 0.
score_mod = (score / score.std()).fillna(0)
# Reference labels of the selected GO term.
annotations = slim_matrix_single_neuron.loc[selected_goterm]
# Class predicted by the SVM stored for this GO term.
y_predicted = models_svm[selected_goterm].predict(score_mod.astype(float))

Plot SVM

View statistics of GOterm

“Perfect” model (with train data)

# Statistics of the selected GO term's SVM against the reference annotations.
# The AUC is computed from the continuous decision values, the remaining
# metrics from the predicted classes.
train_decision = models_svm[selected_goterm].decision_function(score_mod.astype(float))
auc = metrics.roc_auc_score(annotations, train_decision)
cnf_matrix = metrics.confusion_matrix(annotations, y_predicted)
print(cnf_matrix)

for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),  # TP / (TP+FP)
    ("Recall:", metrics.recall_score),  # TP / (TP+FN)
):
    print(metric_label, metric_fn(annotations, y_predicted))
print("AUC with score:", auc)
[[206   6]
 [  2  16]]
Accuracy: 0.9652173913043478
Precision: 0.7272727272727273
Recall: 0.8888888888888888
AUC with score: 0.9855870020964361

TN - FP

FN - TP

En mi opinión, interesa mucho la precisión (precision): prefiero que haya menos FP, ¿no?

Test statistics…

# Reference labels and stored SVM predictions of the selected GO term.
go_truth = slim_matrix_single_neuron.loc[selected_goterm]
go_preds = preds_svm_matrix.loc[selected_goterm]

# The AUC is computed from the platt_matrix scores, not from the predicted classes.
auc = metrics.roc_auc_score(go_truth, platt_matrix.loc[selected_goterm])
cnf_matrix = metrics.confusion_matrix(go_truth, go_preds)
print(cnf_matrix)

print("Accuracy:", metrics.accuracy_score(go_truth, go_preds))
print("Precision:", metrics.precision_score(go_truth, go_preds))  # TP / (TP+FP)
print("Recall:", metrics.recall_score(go_truth, go_preds))  # TP / (TP+FN)
print("AUC with score:", auc)  # area under the ROC curve
[[203   9]
 [  4  14]]
Accuracy: 0.9434782608695652
Precision: 0.6086956521739131
Recall: 0.7777777777777778
AUC with score: 0.9095911949685536
import colorlover as cl

# Counts of the 2x2 confusion matrix; ravel() yields them as tn, fp, fn, tp.
matrix = metrics.confusion_matrix(annotations, y_predicted)
tn, fp, fn, tp = matrix.ravel()

# Slice order of the pie: TP, FN, FP, TN.
values = [tp, fn, fp, tn]
label_text = ["True Positive", "False Negative", "False Positive", "True Negative"]
labels = [f"<b>{abbr}</b>" for abbr in ("TP", "FN", "FP", "TN")]

# 9-step sequential palettes from colorlover.
sequential_9 = cl.flipper()["seq"]["9"]
blue = sequential_9["Blues"]
red = sequential_9["Reds"]
# Fixed slice colors, in the same order as `values`.
colors = ["#ff3700", "#FFA0A0", "#CCE9FF", "#0b8bff"]

trace0 = go.Pie(
    values=values,
    labels=label_text,
    text=labels,
    textinfo="text+value",
    hoverinfo="label+value+percent",
    marker={"colors": colors},
    insidetextfont={"color": "#36382E"},
    sort=False,  # keep the TP, FN, FP, TN order instead of sorting by size
    rotation=90,
)

pie_title = {
    "text": "Confusion Matrix",
    "x": 0.3,
    "y": 0.8,
    "font": {"size": 14},
    "xanchor": "center",
    "yanchor": "top",
}
layout = go.Layout(
    title=pie_title,
    # margin=dict(l=50, r=50, t=100, b=10),
    legend={"font": {"color": "#36382E"}, "orientation": "h", "x": 0.1, "y": -0.03},
    # plot_bgcolor="#282b38",
    # paper_bgcolor="#282b38",
    font={"family": 'Roboto', "color": "#36382E"},
)

data = [trace0]
figure = go.Figure(data=data, layout=layout)
figure
# ROC curve of the selected GO term's SVM against the reference annotations.
y_test = annotations
# Rank the samples with the continuous SVM decision values. Feeding the
# thresholded 0/1 predictions (y_predicted) to roc_curve collapses the curve
# to a single operating point and yields an AUC that disagrees with the
# decision-function AUC reported as "AUC with score".
decision_test = models_svm[selected_goterm].decision_function(score_mod.astype(float))
fpr, tpr, threshold = metrics.roc_curve(y_test, decision_test)

# AUC Score
auc_score = metrics.roc_auc_score(y_true=y_test, y_score=decision_test)

trace0 = go.Scatter(
    x=fpr, y=tpr, mode="lines", name="Test Data", marker={"color": "#ff3700"}
)

layout = go.Layout(
    # The AUC is shown in the title, which is placed inside the plotting area.
    title=dict(text=f"ROC Curve (AUC = {auc_score:.3f})",
            x=0.6,
            y=0.5,
            font=dict(size=20)
              ),
    xaxis=dict(title="False Positive Rate", gridcolor="white"),
    yaxis=dict(title="True Positive Rate", gridcolor="white"),
    legend=dict(x=0, y=1.05, orientation="h"),
    margin=dict(l=100, r=10, t=25, b=40),
#    plot_bgcolor="#282b38",
#    paper_bgcolor="#282b38",
    font=dict(family='Roboto',color= "#36382E"),
)

data = [trace0]
figure = go.Figure(data=data, layout=layout)
figure

Plot SVM with unknown labels

Voronoi Tessellation

What is a Voronoi tessellation? Given a set $P := \{p_1, \dots, p_n\}$ of sites, a Voronoi tessellation is a subdivision of the space into $n$ cells, one for each site in $P$, with the property that a point $q$ lies in the cell corresponding to site $p_i$ if and only if $d(p_i, q) < d(p_j, q)$ for every $j \neq i$. The segments in a Voronoi tessellation correspond to all points in the plane equidistant to the two nearest sites. Voronoi tessellations have applications in computer science.

https://stackoverflow.com/questions/61225052/svm-plot-decision-surface-when-working-with-more-than-2-features

# 2-D t-SNE projector; the fixed random_state makes the embedding repeatable.
tsne = TSNE(n_components=2, verbose=0, init="pca", perplexity=30, random_state=123)
z = tsne.fit_transform(score_mod.astype(float))

selected_svm = models_svm[selected_goterm]
# Extract the feature names from the model (those are the attributions we need)
list_nodes = list(selected_svm.feature_names_in_)
score_unknown = attribution_data_all.loc[list_nodes, unknown].T
# Normalize with the standard deviations of `score`.
# NOTE(review): assumes the columns of `score` match the model's feature names - confirm.
score_unknown_mod = (score_unknown / score.std()).fillna(0)
# 2 = unknown MOA
y_unknown = np.full(len(score_unknown_mod), 2)
y_pred_unknown = selected_svm.predict(score_unknown_mod.astype(float))

# Join scores and annotations from known and unknown drugs (label 2 = unknown MOA).
all_score = pd.concat([score_mod, score_unknown_mod])
all_y = np.concatenate((annotations, y_unknown))

Plot T-SNE SVM

# Import from the public package path; the private `_classification` module
# is an implementation detail and may move between scikit-learn releases.
from sklearn.neighbors import KNeighborsClassifier
# https://github.com/plotly/dash-sample-apps/blob/main/apps/dash-svm/utils/dash_reusable_components.py

# Embed the scores of known and unknown drugs together in 2-D.
z = tsne.fit_transform(all_score.astype(float))

# One row per drug: label (0/1 annotation, 2 = unknown MOA), 2-D coordinates, name.
df = pd.DataFrame()
df["y"] = all_y
df["comp-1"] = z[:,0]
df["comp-2"] = z[:,1]
df["name"] =list(all_score.index)
df = df.sort_values(by=['y'])
df["y"] = df["y"].astype(str)

# SVM prediction of every drug, used to color the background.
X,y = all_score.astype(float), all_y
y_predicted = models_svm[selected_goterm].predict(X)

resolution = 300 # resolution x resolution background pixels
# Bounding box of the embedding, padded by 1 unit on each side.
X2d_xmin, X2d_xmax = np.min(z[:,0])-1, np.max(z[:,0])+1
X2d_ymin, X2d_ymax = np.min(z[:,1])-1, np.max(z[:,1])+1
xx, yy = np.meshgrid(np.linspace(X2d_xmin, X2d_xmax, resolution), np.linspace(X2d_ymin, X2d_ymax, resolution))

# approximate Voronoi tessellation on resolution x resolution grid using 1-NN:
# every grid pixel takes the SVM prediction of its nearest embedded drug
background_model = KNeighborsClassifier(n_neighbors=1).fit(z, y_predicted)
voronoiBackground = background_model.predict(np.c_[xx.ravel(), yy.ravel()])
voronoiBackground = voronoiBackground.reshape((resolution, resolution))
# Name of the selected GO term, taken from its "_1" entry; the slice drops the
# last four characters of the stored name (e.g. a trailing " (1)").
first_node_mask = real_go_info["GO_term"] == selected_goterm + "_1"
go_name = real_go_info.loc[first_node_mask, "Name"].values[0][:-4]
go_name
'Cellular response to interleukin-4'
# Marker colors by label: 0 -> blue, 1 -> red, 2 (unknown MOA) -> dark grey.
bright_cscale = [[0, "#0b8bff"], [0.5, "#ff3700"], [1, "#36382E"]]
# Pale background colors for the two predicted classes.
new_cscale = [[0, "#CCE9FF"], [1, "#FFA0A0"]]

# Flattened background grid, shared by both contour traces.
grid_x = xx.flatten()
grid_y = yy.flatten()
grid_z = voronoiBackground.flatten()

# Filled background showing the predicted class of every grid pixel.
trace0 = go.Contour(
    x=grid_x,
    y=grid_y,
    z=grid_z,
    colorscale=new_cscale,
    contours={"showlines": False},
    opacity=0.9,
    showscale=False,
    hoverinfo="none",
)

# Same surface drawn again with red contour lines.
trace1 = go.Contour(
    x=grid_x,
    y=grid_y,
    z=grid_z,
    colorscale=new_cscale,
    line={"color": "#ff3700"},
    showscale=False,
    hoverinfo="none",
)

# One marker per drug, colored by its label and showing its name as text.
trace2 = go.Scatter(
    x=df["comp-1"],
    y=df["comp-2"],
    mode="markers",
    text=df["name"].to_numpy(),
    marker={
        "size": 7,
        "color": df["y"].to_numpy(int),
        "colorscale": bright_cscale,
    },
    showlegend=False,
)

# Point-less traces that exist only to create the legend entries.
legend1, legend2, legend3 = (
    go.Scatter(
        x=[None],
        y=[None],
        mode="markers",
        name=entry_name,
        marker={"size": 7, "color": entry_color, "symbol": 'circle'},
    )
    for entry_name, entry_color in (
        ("Not annotated to<br>" + selected_goterm, "#0b8bff"),
        ("Drug annotated to<br>" + selected_goterm, "#ff3700"),
        ("Unknown MOA<br>annotations", "#36382E"),
    )
)

# Both axes are purely positional: no ticks, labels, grid or zero line.
hidden_axis = {"ticks": "", "showticklabels": False, "showgrid": False, "zeroline": False}

layout = go.Layout(
    title={
        "text": "<b>" + selected_goterm + "</b> " + go_name,
        "x": 0.5,
        "y": 0.92,
        "font": {"size": 18},
        "xanchor": 'center',
        "yanchor": 'top',
    },
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis),
    # Clamp the view to the extent of the background grid.
    yaxis_range=[yy.min(), yy.max()],
    xaxis_range=[xx.min(), xx.max()],
    legend={"x": 0, "y": 0, "orientation": "h", "font": {"size": 14}},
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=800,
    font={"family": 'Roboto', "color": "#36382E", "size": 15},
)

# Draw order: background, contour lines, drugs, then the legend-only traces.
data = [trace0, trace1, trace2, legend2, legend1, legend3]
figure = go.Figure(data=data, layout=layout)

figure